[llvm] ab4b924 - [X86] X86FixupVectorConstantsPass - attempt to replace full width integer vector constant loads with broadcasts on AVX2+ targets

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue May 30 05:18:10 PDT 2023


Author: Simon Pilgrim
Date: 2023-05-30T13:17:26+01:00
New Revision: ab4b924832ce26c21b88d7f82fcf4992ea8906bb

URL: https://github.com/llvm/llvm-project/commit/ab4b924832ce26c21b88d7f82fcf4992ea8906bb
DIFF: https://github.com/llvm/llvm-project/commit/ab4b924832ce26c21b88d7f82fcf4992ea8906bb.diff

LOG: [X86] X86FixupVectorConstantsPass - attempt to replace full width integer vector constant loads with broadcasts on AVX2+ targets

lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, which leaves many situations where a full-width vector load that has failed to fold, but is loading splat constant values, could instead use a broadcast load just as cheaply and save constant pool space.
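
As a rough illustration of the pattern this affects (the function name below is hypothetical and the assembly comments are paraphrased from the AVX2 diffs in abdu-vector-128.ll further down; exact output depends on target features and surrounding code, e.g. when built with `llc -mtriple=x86_64-unknown-unknown -mattr=+avx2`):

define <2 x i64> @flip_signbits(<2 x i64> %a, <2 x i64> %b) {
  ; The splat constant has two uses, so lowering materializes it in a register
  ; instead of folding it into a single instruction's memory operand.
  ;   before: vmovdqa      {constant pool}(%rip), %xmm2  # full 16-byte pool entry
  ;   after:  vpbroadcastq {constant pool}(%rip), %xmm2  # 8-byte pool entry, value splatted
  %xa = xor <2 x i64> %a, <i64 -9223372036854775808, i64 -9223372036854775808>
  %xb = xor <2 x i64> %b, <i64 -9223372036854775808, i64 -9223372036854775808>
  %r = add <2 x i64> %xa, %xb
  ret <2 x i64> %r
}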

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86FixupVectorConstants.cpp
    llvm/test/CodeGen/X86/abdu-vector-128.ll
    llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
    llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
    llvm/test/CodeGen/X86/avx2-arith.ll
    llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
    llvm/test/CodeGen/X86/avx2-shift.ll
    llvm/test/CodeGen/X86/avx2-vector-shifts.ll
    llvm/test/CodeGen/X86/avx512-arith.ll
    llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
    llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
    llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
    llvm/test/CodeGen/X86/bitcast-vector-bool.ll
    llvm/test/CodeGen/X86/combine-bitselect.ll
    llvm/test/CodeGen/X86/combine-pavg.ll
    llvm/test/CodeGen/X86/combine-sdiv.ll
    llvm/test/CodeGen/X86/combine-shl.ll
    llvm/test/CodeGen/X86/combine-smax.ll
    llvm/test/CodeGen/X86/combine-smin.ll
    llvm/test/CodeGen/X86/combine-sra.ll
    llvm/test/CodeGen/X86/combine-srl.ll
    llvm/test/CodeGen/X86/concat-cast.ll
    llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
    llvm/test/CodeGen/X86/dpbusd_i4.ll
    llvm/test/CodeGen/X86/freeze-vector.ll
    llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
    llvm/test/CodeGen/X86/gfni-rotates.ll
    llvm/test/CodeGen/X86/gfni-shifts.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
    llvm/test/CodeGen/X86/icmp-pow2-diff.ll
    llvm/test/CodeGen/X86/insert-into-constant-vector.ll
    llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
    llvm/test/CodeGen/X86/masked_store_trunc.ll
    llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
    llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/movmsk-cmp.ll
    llvm/test/CodeGen/X86/oddshuffles.ll
    llvm/test/CodeGen/X86/paddus.ll
    llvm/test/CodeGen/X86/pmaddubsw.ll
    llvm/test/CodeGen/X86/pmul.ll
    llvm/test/CodeGen/X86/pmulh.ll
    llvm/test/CodeGen/X86/pr37499.ll
    llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
    llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
    llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
    llvm/test/CodeGen/X86/psubus.ll
    llvm/test/CodeGen/X86/sadd_sat_vec.ll
    llvm/test/CodeGen/X86/sat-add.ll
    llvm/test/CodeGen/X86/setcc-non-simple-type.ll
    llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
    llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
    llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
    llvm/test/CodeGen/X86/slow-pmulld.ll
    llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/sshl_sat_vec.ll
    llvm/test/CodeGen/X86/ssub_sat_vec.ll
    llvm/test/CodeGen/X86/uadd_sat_vec.ll
    llvm/test/CodeGen/X86/umax.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
    llvm/test/CodeGen/X86/usub_sat_vec.ll
    llvm/test/CodeGen/X86/vec_anyext.ll
    llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
    llvm/test/CodeGen/X86/vec_int_to_fp.ll
    llvm/test/CodeGen/X86/vec_minmax_uint.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_uaddo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll
    llvm/test/CodeGen/X86/vec_usubo.ll
    llvm/test/CodeGen/X86/vector-bitreverse.ll
    llvm/test/CodeGen/X86/vector-blend.ll
    llvm/test/CodeGen/X86/vector-fshl-128.ll
    llvm/test/CodeGen/X86/vector-fshl-256.ll
    llvm/test/CodeGen/X86/vector-fshl-512.ll
    llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
    llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
    llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
    llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
    llvm/test/CodeGen/X86/vector-fshr-128.ll
    llvm/test/CodeGen/X86/vector-fshr-256.ll
    llvm/test/CodeGen/X86/vector-fshr-512.ll
    llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
    llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
    llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
    llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
    llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
    llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
    llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
    llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
    llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
    llvm/test/CodeGen/X86/vector-lzcnt-256.ll
    llvm/test/CodeGen/X86/vector-lzcnt-512.ll
    llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
    llvm/test/CodeGen/X86/vector-popcnt-128.ll
    llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
    llvm/test/CodeGen/X86/vector-popcnt-256.ll
    llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
    llvm/test/CodeGen/X86/vector-popcnt-512.ll
    llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
    llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-umax.ll
    llvm/test/CodeGen/X86/vector-reduce-umin.ll
    llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
    llvm/test/CodeGen/X86/vector-rotate-128.ll
    llvm/test/CodeGen/X86/vector-rotate-256.ll
    llvm/test/CodeGen/X86/vector-rotate-512.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
    llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
    llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
    llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
    llvm/test/CodeGen/X86/vector-shift-shl-256.ll
    llvm/test/CodeGen/X86/vector-shift-shl-512.ll
    llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
    llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
    llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
    llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
    llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining.ll
    llvm/test/CodeGen/X86/vector-shuffle-v192.ll
    llvm/test/CodeGen/X86/vector-shuffle-v48.ll
    llvm/test/CodeGen/X86/vector-trunc-math.ll
    llvm/test/CodeGen/X86/vector-trunc-packus.ll
    llvm/test/CodeGen/X86/vector-trunc-ssat.ll
    llvm/test/CodeGen/X86/vector-trunc-usat.ll
    llvm/test/CodeGen/X86/vector-trunc.ll
    llvm/test/CodeGen/X86/vector-tzcnt-128.ll
    llvm/test/CodeGen/X86/vector-tzcnt-256.ll
    llvm/test/CodeGen/X86/vector-tzcnt-512.ll
    llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
    llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
    llvm/test/CodeGen/X86/vselect-avx.ll
    llvm/test/CodeGen/X86/vselect-minmax.ll
    llvm/test/CodeGen/X86/vselect-pcmp.ll
    llvm/test/CodeGen/X86/vselect-post-combine.ll
    llvm/test/CodeGen/X86/x86-interleaved-access.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
    llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 03e474b9e2e18..161978be7ff62 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -231,6 +231,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
   unsigned Opc = MI.getOpcode();
   MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
   bool HasDQI = ST->hasDQI();
+  bool HasBWI = ST->hasBWI();
 
   auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
                                 unsigned OpBcst64, unsigned OpBcst32,
@@ -305,6 +306,47 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
         HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
         X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
+    /* Integer Loads */
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+    if (ST->hasAVX2())
+      return ConvertToBroadcast(0, 0, X86::VPBROADCASTQrm, X86::VPBROADCASTDrm,
+                                X86::VPBROADCASTWrm, X86::VPBROADCASTBrm, 1);
+    return false;
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    if (ST->hasAVX2())
+      return ConvertToBroadcast(0, X86::VBROADCASTI128, X86::VPBROADCASTQYrm,
+                                X86::VPBROADCASTDYrm, X86::VPBROADCASTWYrm,
+                                X86::VPBROADCASTBYrm, 1);
+    return false;
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQU64Z128rm:
+    return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
+                              X86::VPBROADCASTDZ128rm,
+                              HasBWI ? X86::VPBROADCASTWZ128rm : 0,
+                              HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQU64Z256rm:
+    return ConvertToBroadcast(
+        0, HasDQI ? X86::VBROADCASTI64X2Z128rm : X86::VBROADCASTI32X4Z256rm,
+        X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
+        HasBWI ? X86::VPBROADCASTWZ256rm : 0,
+        HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQU64Zrm:
+    return ConvertToBroadcast(
+        HasDQI ? X86::VBROADCASTI32X8rm : X86::VBROADCASTI64X4rm,
+        HasDQI ? X86::VBROADCASTI64X2rm : X86::VBROADCASTI32X4rm,
+        X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
+        HasBWI ? X86::VPBROADCASTWZrm : 0, HasBWI ? X86::VPBROADCASTBZrm : 0,
+        1);
   }
 
   // Attempt to find a AVX512 mapping from a full width memory-fold instruction

diff  --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index f57c6e45dc7f4..b10a6fabb05b6 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -261,7 +261,7 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: abd_ext_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -338,7 +338,7 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: abd_ext_v2i64_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -508,7 +508,7 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: abd_minmax_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -675,7 +675,7 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: abd_cmp_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -757,7 +757,7 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
 ;
 ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2

diff  --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 99a15ef81b9aa..d9ce46f202423 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1221,7 +1221,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1346,7 +1346,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1466,7 +1466,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -2695,7 +2696,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -2965,7 +2966,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -3244,7 +3245,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
@@ -4834,7 +4836,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -4955,7 +4958,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]

diff  --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index f64d9aa3280f0..bae04d9dc769b 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1012,7 +1012,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1110,7 +1110,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1202,7 +1202,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -2142,7 +2143,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2375,7 +2376,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2612,7 +2613,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index e2e5cadf9d465..2dc33d32e9d3a 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -174,7 +174,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X32-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X32-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -188,7 +188,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X64-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; X64-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]

diff  --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 15e2c3890354f..7cf459e566617 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1497,27 +1497,27 @@ define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
 define <2 x i64> @test_x86_avx2_psrlv_q_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,0,4,0]
-; X86-AVX-NEXT:    # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [4,0,4,0]
+; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0]
-; X86-AVX512VL-NEXT:    # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0]
+; X86-AVX512VL-NEXT:    # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,4]
-; X64-AVX-NEXT:    # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [4,4]
+; X64-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X64-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
 ; X64-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
@@ -1554,18 +1554,18 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
 define <4 x i64> @test_x86_avx2_psrlv_q_256_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0]
-; X86-AVX-NEXT:    # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0]
+; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
+; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0]
-; X86-AVX512VL-NEXT:    # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0]
+; X86-AVX512VL-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]

diff  --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll
index 7f163ef266c7a..f70c547958519 100644
--- a/llvm/test/CodeGen/X86/avx2-shift.ll
+++ b/llvm/test/CodeGen/X86/avx2-shift.ll
@@ -377,7 +377,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X86-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; X86-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
@@ -386,7 +386,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X64-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; X64-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 7348a8a6db8c7..8fb7c65a9a60b 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -276,7 +276,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
 ;
 ; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
 ; X86-FAST-ALL:       # %bb.0:
-; X86-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X86-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; X86-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; X86-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; X86-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
 ; X86-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
@@ -306,7 +307,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
 ;
 ; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
 ; X64-FAST-ALL:       # %bb.0:
-; X64-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X64-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; X64-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; X64-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; X64-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
 ; X64-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1

diff  --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 5c333da422dcc..25e297993bd7c 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -293,7 +293,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
 define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
 ; AVX512F-LABEL: imulq128_bcast:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
@@ -313,7 +313,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
 ;
 ; AVX512BW-LABEL: imulq128_bcast:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
@@ -324,7 +324,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
 ; AVX512DQ-LABEL: imulq128_bcast:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT:    vzeroupper

diff  --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f4c6532e8da35..780abc9f9dc43 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6775,7 +6775,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; X86-NEXT:    vpsrlq $32, %ymm1, %ymm2
@@ -7978,7 +7978,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
 ; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
@@ -8128,7 +8128,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
 ; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0

diff  --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index cc0da34453eb5..ec0f14ae4e58e 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1649,7 +1649,8 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u>
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [15,5,3,2,15,5,3,2]
+; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
 ; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
@@ -1666,7 +1667,8 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u>
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [15,5,3,2,15,5,3,2]
+; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1993,7 +1995,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2016,7 +2019,8 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0

diff  --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index be7ac666cbbec..17d6266ab7c9e 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1412,18 +1412,20 @@ define <32 x i16>@test_int_x86_avx512_maskz_psrav32_hi(<32 x i16> %x0, <32 x i16
 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; X86-NEXT:    # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X86-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X86-NEXT:    # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A]
 ; X86-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X86-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
 ; X86-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; X64-NEXT:    # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X64-NEXT:    # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A]
 ; X64-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X64-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
 ; X64-NEXT:    # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-NEXT:    retq # encoding: [0xc3]

diff  --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index a08200fde8e78..c22ec12c1b207 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -842,7 +842,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1

diff  --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index c092ed4f9f668..8687dae107f24 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -961,7 +961,7 @@ define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_v32i16_cmp:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
 ; AVX512-NEXT:    kortestw %k0, %k0

diff  --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 23f56d908cb2e..6a0dc4ae0695d 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -377,7 +377,8 @@ define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, ptr nocapture readonly) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
@@ -429,7 +430,8 @@ define <4 x i64> @bitselect_v4i64_mr(ptr nocapture readonly, <4 x i64>) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
@@ -481,7 +483,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
@@ -489,7 +492,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado
 ; AVX512VL-LABEL: bitselect_v4i64_mm:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512VL-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
   %3 = load <4 x i64>, ptr %0
@@ -849,7 +853,8 @@ define <8 x i64> @bitselect_v8i64_mm(ptr nocapture readonly, ptr nocapture reado
 ; AVX512-LABEL: bitselect_v8i64_mm:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
+; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpternlogq $202, (%rdi), %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %3 = load <8 x i64>, ptr %0

diff  --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index a8923210427e5..4a3b765924513 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
 
 declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
 declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
@@ -31,17 +31,29 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
 ; SSE-NEXT:    packuswb %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_pavgw_knownbits:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
-; AVX-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm4, %xmm2, %xmm1
-; AVX-NEXT:    vpand %xmm4, %xmm3, %xmm2
-; AVX-NEXT:    vpavgw %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_pavgw_knownbits:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pavgw_knownbits:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm1
+; AVX2-NEXT:    vpand %xmm4, %xmm3, %xmm2
+; AVX2-NEXT:    vpavgw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %m0 = and <8 x i16> %a0, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
   %m1 = and <8 x i16> %a1, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
   %m2 = and <8 x i16> %a2, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>

diff  --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index bcdcfdd714784..8ede23616d65e 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -421,7 +421,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -437,7 +437,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index e443e8472f31f..a05da63e43e12 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -148,7 +148,8 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ;
 ; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
 ; AVX-FAST-ALL:       # %bb.0:
-; AVX-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll
index efaeb97b89d65..828a36d4450ae 100644
--- a/llvm/test/CodeGen/X86/combine-smax.ll
+++ b/llvm/test/CodeGen/X86/combine-smax.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2
 
 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: test_v16i8_nosignbit:
@@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE42-NEXT:    pmaxsb %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v16i8_nosignbit:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i8_nosignbit:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %1 = and <16 x i8> %a, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
   %2 = and <16 x i8> %b, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
   %3 = icmp sgt <16 x i8> %1, %2

diff  --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll
index b228b153f6eef..6a69bd06d85e0 100644
--- a/llvm/test/CodeGen/X86/combine-smin.ll
+++ b/llvm/test/CodeGen/X86/combine-smin.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2
 
 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: test_v16i8_nosignbit:
@@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE42-NEXT:    pminsb %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v16i8_nosignbit:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v16i8_nosignbit:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %1 = and <16 x i8> %a, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
   %2 = and <16 x i8> %b, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
   %3 = icmp slt <16 x i8> %1, %2

diff  --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 70e0128629898..db37db7ec1be5 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -178,7 +178,8 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ;
 ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-FAST-ALL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index d2704e8f7af3a..5c69fe9055971 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -214,7 +214,8 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
 ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1:
 ; AVX2-FAST-ALL:       # %bb.0:
 ; AVX2-FAST-ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -422,7 +423,8 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ;
 ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll
index a542ab95c76b6..b898be5941ed1 100644
--- a/llvm/test/CodeGen/X86/concat-cast.ll
+++ b/llvm/test/CodeGen/X86/concat-cast.ll
@@ -373,7 +373,7 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) {
 ; AVX2-LABEL: mismatch_tofp_v4i32_v4f32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
 ; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vsubpd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtpd2ps %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index edfcd94b43ae6..1baaab0931cb9 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -788,7 +788,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_unary(<32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 0>
@@ -800,7 +801,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary(<3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 32>
@@ -857,7 +859,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_unary(<32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
@@ -869,7 +872,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary(<3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 48, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 48>

diff  --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll
index 2e25b5a0e1c03..906fead7f8db5 100644
--- a/llvm/test/CodeGen/X86/dpbusd_i4.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -53,7 +53,7 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpsllw $4, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; CHECK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
 ; CHECK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
@@ -107,7 +107,7 @@ entry:
 define i32 @mul_zext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
 ; CHECK-LABEL: mul_zext_i4i4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2

diff  --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 93d6a9f3fc9a5..d4dd264bfb5ef 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -288,7 +288,7 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds
 ;
 ; X64-LABEL: freeze_buildvector_single_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,42,42,42>
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
 ; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -322,7 +322,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ;
 ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,42,u,u>
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
 ; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]

diff  --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 74eb3a56ef672..6fe3de87f9827 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -238,7 +238,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $7, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; GFNIAVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
@@ -323,7 +323,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
 ; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $6, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 7b79b02751164..ff9dc24e65a64 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -242,7 +242,7 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
@@ -328,7 +328,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
 ; GFNIAVX2-NEXT:    vpandn %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $6, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index d5ed003c45092..14efd6ab1f6c8 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -62,19 +62,28 @@ define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-NEXT:    psubb %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8:
-; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; GFNIAVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    retq
+; GFNIAVX1-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatconstant_ashr_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_ashr_v16i8:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; GFNIAVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; GFNIAVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
 ; GFNIAVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
@@ -193,7 +202,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; GFNIAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
@@ -201,7 +210,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind {
 ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsrlw $2, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; GFNIAVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; GFNIAVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
 ; GFNIAVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
@@ -247,7 +256,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX2-LABEL: splatconstant_shl_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
 ; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -296,7 +305,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -363,9 +372,9 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8:
 ; GFNIAVX2:       # %bb.0:
 ; GFNIAVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; GFNIAVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; GFNIAVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
@@ -377,7 +386,7 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX512-LABEL: splatconstant_ashr_v64i8:
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
-; GFNIAVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; GFNIAVX512-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; GFNIAVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
 ; GFNIAVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; GFNIAVX512-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 0c59e880dfd37..3af28d3b4c966 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -122,7 +122,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2

diff  --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index b64b0bf244139..5985dcae91842 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -124,7 +124,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X64-AVX2-LABEL: test_reduce_v2i64:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; X64-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2

diff  --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
index 0b1137ff96643..f2f12654e6834 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
@@ -217,7 +217,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -226,7 +226,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 364fd81eb1aa9..646ca1709c503 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -150,11 +150,23 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
 ; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X86-AVX-NEXT:    retl
 ;
-; X64-AVX-LABEL: elt0_v2i64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,1>
-; X64-AVX-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: elt0_v2i64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,1>
+; X64-AVX1-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: elt0_v2i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,1]
+; X64-AVX2-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512F-LABEL: elt0_v2i64:
+; X64-AVX512F:       # %bb.0:
+; X64-AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,1]
+; X64-AVX512F-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64-AVX512F-NEXT:    retq
    %ins = insertelement <2 x i64> <i64 42, i64 1>, i64 %x, i32 0
    ret <2 x i64> %ins
 }

diff  --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index d65bf782d7994..e3c5a5023ac9e 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -425,7 +425,7 @@ define <2 x i64> @reassociate_umax_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
 ; AVX2-LABEL: reassociate_umax_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm2, %xmm4
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm5
 ; AVX2-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm4
@@ -723,7 +723,7 @@ define <2 x i64> @reassociate_umin_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
 ; AVX2-LABEL: reassociate_umin_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm2, %xmm4
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm5
 ; AVX2-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4

diff  --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 17548df343251..a6f4296ae5640 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -1494,7 +1494,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -3915,7 +3915,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]

diff  --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index f605cd8271495..ffb3142df9fa9 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -2238,7 +2238,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm0, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -2451,10 +2451,10 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
 ; AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
 ; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2588,34 +2588,63 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pextrw $1, %xmm0, 2(%rdi)
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: truncstore_v2i64_v2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [32767,32767]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
-; AVX-NEXT:    testb $1, %al
-; AVX-NEXT:    jne .LBB7_1
-; AVX-NEXT:  # %bb.2: # %else
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    jne .LBB7_3
-; AVX-NEXT:  .LBB7_4: # %else2
-; AVX-NEXT:    retq
-; AVX-NEXT:  .LBB7_1: # %cond.store
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    je .LBB7_4
-; AVX-NEXT:  .LBB7_3: # %cond.store1
-; AVX-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: truncstore_v2i64_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32767,32767]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskpd %xmm1, %eax
+; AVX1-NEXT:    xorl $3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    jne .LBB7_1
+; AVX1-NEXT:  # %bb.2: # %else
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    jne .LBB7_3
+; AVX1-NEXT:  .LBB7_4: # %else2
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB7_1: # %cond.store
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    je .LBB7_4
+; AVX1-NEXT:  .LBB7_3: # %cond.store1
+; AVX1-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [32767,32767]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskpd %xmm1, %eax
+; AVX2-NEXT:    xorl $3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    jne .LBB7_1
+; AVX2-NEXT:  # %bb.2: # %else
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    jne .LBB7_3
+; AVX2-NEXT:  .LBB7_4: # %else2
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB7_1: # %cond.store
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je .LBB7_4
+; AVX2-NEXT:  .LBB7_3: # %cond.store1
+; AVX2-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i16:
 ; AVX512F:       # %bb.0:
@@ -2756,33 +2785,61 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pextrb $1, %xmm2, 1(%rdi)
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: truncstore_v2i64_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [127,127]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
-; AVX-NEXT:    testb $1, %al
-; AVX-NEXT:    jne .LBB8_1
-; AVX-NEXT:  # %bb.2: # %else
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    jne .LBB8_3
-; AVX-NEXT:  .LBB8_4: # %else2
-; AVX-NEXT:    retq
-; AVX-NEXT:  .LBB8_1: # %cond.store
-; AVX-NEXT:    vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    je .LBB8_4
-; AVX-NEXT:  .LBB8_3: # %cond.store1
-; AVX-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: truncstore_v2i64_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskpd %xmm1, %eax
+; AVX1-NEXT:    xorl $3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    jne .LBB8_1
+; AVX1-NEXT:  # %bb.2: # %else
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    jne .LBB8_3
+; AVX1-NEXT:  .LBB8_4: # %else2
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB8_1: # %cond.store
+; AVX1-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    je .LBB8_4
+; AVX1-NEXT:  .LBB8_3: # %cond.store1
+; AVX1-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [127,127]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskpd %xmm1, %eax
+; AVX2-NEXT:    xorl $3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    jne .LBB8_1
+; AVX2-NEXT:  # %bb.2: # %else
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    jne .LBB8_3
+; AVX2-NEXT:  .LBB8_4: # %else2
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB8_1: # %cond.store
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je .LBB8_4
+; AVX2-NEXT:  .LBB8_3: # %cond.store1
+; AVX2-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i8:
 ; AVX512F:       # %bb.0:

diff  --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 682e2002c075a..e288692d9eb89 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -1929,7 +1929,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
@@ -2125,7 +2125,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
 ; AVX2-NEXT:    # xmm2 = mem[0,0]
 ; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
 ; AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2239,34 +2239,63 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pextrw $1, %xmm0, 2(%rdi)
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: truncstore_v2i64_v2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [65535,65535]
-; AVX-NEXT:    # xmm3 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
-; AVX-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
-; AVX-NEXT:    testb $1, %al
-; AVX-NEXT:    jne .LBB7_1
-; AVX-NEXT:  # %bb.2: # %else
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    jne .LBB7_3
-; AVX-NEXT:  .LBB7_4: # %else2
-; AVX-NEXT:    retq
-; AVX-NEXT:  .LBB7_1: # %cond.store
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    je .LBB7_4
-; AVX-NEXT:  .LBB7_3: # %cond.store1
-; AVX-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: truncstore_v2i64_v2i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskpd %xmm1, %eax
+; AVX1-NEXT:    xorl $3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    jne .LBB7_1
+; AVX1-NEXT:  # %bb.2: # %else
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    jne .LBB7_3
+; AVX1-NEXT:  .LBB7_4: # %else2
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB7_1: # %cond.store
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    je .LBB7_4
+; AVX1-NEXT:  .LBB7_3: # %cond.store1
+; AVX1-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX2-NEXT:    # xmm3 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
+; AVX2-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskpd %xmm1, %eax
+; AVX2-NEXT:    xorl $3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    jne .LBB7_1
+; AVX2-NEXT:  # %bb.2: # %else
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    jne .LBB7_3
+; AVX2-NEXT:  .LBB7_4: # %else2
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB7_1: # %cond.store
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je .LBB7_4
+; AVX2-NEXT:  .LBB7_3: # %cond.store1
+; AVX2-NEXT:    vpextrw $1, %xmm0, 2(%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i16:
 ; AVX512F:       # %bb.0:
@@ -2387,33 +2416,61 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pextrb $1, %xmm3, 1(%rdi)
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: truncstore_v2i64_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [255,255]
-; AVX-NEXT:    # xmm3 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
-; AVX-NEXT:    testb $1, %al
-; AVX-NEXT:    jne .LBB8_1
-; AVX-NEXT:  # %bb.2: # %else
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    jne .LBB8_3
-; AVX-NEXT:  .LBB8_4: # %else2
-; AVX-NEXT:    retq
-; AVX-NEXT:  .LBB8_1: # %cond.store
-; AVX-NEXT:    vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT:    testb $2, %al
-; AVX-NEXT:    je .LBB8_4
-; AVX-NEXT:  .LBB8_3: # %cond.store1
-; AVX-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: truncstore_v2i64_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vmovmskpd %xmm1, %eax
+; AVX1-NEXT:    xorl $3, %eax
+; AVX1-NEXT:    testb $1, %al
+; AVX1-NEXT:    jne .LBB8_1
+; AVX1-NEXT:  # %bb.2: # %else
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    jne .LBB8_3
+; AVX1-NEXT:  .LBB8_4: # %else2
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB8_1: # %cond.store
+; AVX1-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT:    testb $2, %al
+; AVX1-NEXT:    je .LBB8_4
+; AVX1-NEXT:  .LBB8_3: # %cond.store1
+; AVX1-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: truncstore_v2i64_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX2-NEXT:    # xmm3 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovmskpd %xmm1, %eax
+; AVX2-NEXT:    xorl $3, %eax
+; AVX2-NEXT:    testb $1, %al
+; AVX2-NEXT:    jne .LBB8_1
+; AVX2-NEXT:  # %bb.2: # %else
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    jne .LBB8_3
+; AVX2-NEXT:  .LBB8_4: # %else2
+; AVX2-NEXT:    retq
+; AVX2-NEXT:  .LBB8_1: # %cond.store
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX2-NEXT:    testb $2, %al
+; AVX2-NEXT:    je .LBB8_4
+; AVX2-NEXT:  .LBB8_3: # %cond.store1
+; AVX2-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: truncstore_v2i64_v2i8:
 ; AVX512F:       # %bb.0:
@@ -5867,7 +5924,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
 ; AVX2-LABEL: truncstore_v32i16_v32i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpminuw %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpminuw %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
@@ -6107,7 +6164,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpminuw %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminuw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero

diff  --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 7e6cfc56574f5..c34ffc554f226 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -975,7 +975,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1011,7 +1011,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1115,27 +1115,49 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: vec128_i64_unsigned_reg_reg:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm2
-; AVX-NEXT:    vpsrlq $33, %xmm1, %xmm1
-; AVX-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
-; AVX-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vpsllq $32, %xmm1, %xmm1
-; AVX-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vec128_i64_unsigned_reg_reg:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrlq $33, %xmm1, %xmm1
+; AVX2-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX2-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_unsigned_reg_reg:
 ; XOP:       # %bb.0:
@@ -1162,7 +1184,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1198,7 +1220,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
@@ -1352,7 +1374,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1389,7 +1411,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1542,7 +1564,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1579,7 +1601,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1735,7 +1757,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1773,7 +1795,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1877,7 +1899,7 @@ define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2002,7 +2024,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
@@ -2110,7 +2132,7 @@ define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm0, %xmm1, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm1, %xmm0
@@ -2218,7 +2240,7 @@ define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2331,7 +2353,7 @@ define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2565,7 +2587,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
@@ -2798,7 +2820,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
@@ -3040,7 +3062,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
@@ -3280,7 +3302,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
@@ -3529,7 +3551,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1

diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 960a55f01aec9..8d9ec21971f16 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1333,7 +1333,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1461,7 +1461,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
@@ -1589,7 +1589,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %ymm0, %ymm1, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm1, %ymm0
@@ -1717,7 +1717,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1850,7 +1850,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1949,7 +1949,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2009,7 +2009,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2037,7 +2037,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
@@ -2136,7 +2136,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2197,7 +2197,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2226,7 +2226,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX512BW-FALLBACK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
@@ -2324,7 +2324,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2386,7 +2386,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2415,7 +2415,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %ymm0, %ymm1, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm1, %ymm0
@@ -2513,7 +2513,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2575,7 +2575,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2604,7 +2604,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
@@ -2704,7 +2704,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2768,7 +2768,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -2798,7 +2798,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1

diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 57fdd3efcf231..8dffb2c855926 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -918,7 +918,7 @@ define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-
 ; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -995,7 +995,7 @@ define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-
 ; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; CHECK-AVX512-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]

diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 1e31d88e88f3f..38b77c6c09b7d 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -1929,7 +1929,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
 ; KNL-LABEL: allones_v2i64_and1:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,1]
 ; KNL-NEXT:    vptestnmq %zmm1, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $3, %al
@@ -3075,7 +3075,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
 ; KNL-LABEL: allones_v2i64_and4:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,4]
 ; KNL-NEXT:    vptestnmq %zmm1, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $3, %al

diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index fa36c15b6445a..529e0ad24936a 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1994,7 +1994,8 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) {
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
@@ -2165,7 +2166,8 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) {
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll
index 40d6ec6fb3155..06388362f91b7 100644
--- a/llvm/test/CodeGen/X86/paddus.ll
+++ b/llvm/test/CodeGen/X86/paddus.ll
@@ -486,7 +486,7 @@ define <64 x i8> @test14(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test14:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -526,7 +526,7 @@ define <64 x i8> @test15(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test15:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -566,7 +566,7 @@ define <64 x i8> @test16(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -641,7 +641,7 @@ define <64 x i8> @test17(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test17:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT:    vpmaxub %ymm1, %ymm2, %ymm1
@@ -694,7 +694,7 @@ define <64 x i8> @test18(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test18:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -1238,7 +1238,7 @@ define <32 x i16> @test32(<32 x i16> %x) {
 ;
 ; AVX2-LABEL: test32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -1278,7 +1278,7 @@ define <32 x i16> @test33(<32 x i16> %x) {
 ;
 ; AVX2-LABEL: test33:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -1318,7 +1318,7 @@ define <32 x i16> @test34(<32 x i16> %x) {
 ;
 ; AVX2-LABEL: test34:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
@@ -1449,7 +1449,7 @@ define <32 x i16> @test35(<32 x i16> %x) {
 ;
 ; AVX2-LABEL: test35:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT:    vpmaxuw %ymm1, %ymm2, %ymm1
@@ -1502,7 +1502,7 @@ define <32 x i16> @test36(<32 x i16> %x) {
 ;
 ; AVX2-LABEL: test36:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index 2919a3019e75d..a553fbe73b227 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -348,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) {
 ; AVX256:       # %bb.0:
 ; AVX256-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX256-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX256-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX256-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX256-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX256-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX256-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
 ; AVX256-NEXT:    vpshufb %xmm4, %xmm1, %xmm1

diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 1b2dae5f2830a..8e6ae4b552657 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -122,7 +122,7 @@ define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind  {
 ;
 ; AVX-LABEL: mul_v2i64c:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [117,117]
+; AVX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [117,117]
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
@@ -417,9 +417,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ; AVX2-LABEL: mul_v32i8c:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
@@ -430,9 +430,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
 ; AVX512F-LABEL: mul_v32i8c:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
@@ -593,7 +593,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -607,7 +607,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -799,9 +799,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX2-LABEL: mul_v64i8c:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
@@ -820,9 +820,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
@@ -841,9 +841,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512BW-LABEL: mul_v64i8c:
 ; AVX512BW:       # %bb.0: # %entry
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
@@ -955,7 +955,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -980,7 +980,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -1004,7 +1004,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]

diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 1d7aa7f2586fe..f28f07e605fd8 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -372,7 +372,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512F-LABEL: and_mulhuw_v16i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/pr37499.ll b/llvm/test/CodeGen/X86/pr37499.ll
index 2995017275c18..15a7739fd2c7f 100644
--- a/llvm/test/CodeGen/X86/pr37499.ll
+++ b/llvm/test/CodeGen/X86/pr37499.ll
@@ -4,7 +4,7 @@
 define <2 x i64> @undef_tval() {
 ; CHECK-LABEL: undef_tval:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0 {%k1}
@@ -18,7 +18,7 @@ define <2 x i64> @undef_tval() {
 define <2 x i64> @foo(<8 x i64> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
@@ -33,7 +33,7 @@ define <2 x i64> @foo(<8 x i64> %x) {
 define <4 x i64> @goo(<16 x i32> %x) {
 ; CHECK-LABEL: goo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    movw $1, %ax
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}

diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
index 0ce83b190ead8..bbe46a99ffa41 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -89,7 +89,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
 define <32 x i8> @testv32i8(<32 x i8> %in) {
 ; AVX256-LABEL: testv32i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX256-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX256-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX256-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX256-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
index 06d4b6c1c197b..34e32c43ef797 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll
@@ -36,7 +36,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 define <16 x i8> @testv16i8(<16 x i8> %in) {
 ; AVX256-LABEL: testv16i8:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX256-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX256-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX256-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -60,9 +60,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) {
 define <16 x i16> @testv16i16(<16 x i16> %in) {
 ; AVX256-LABEL: testv16i16:
 ; AVX256:       # %bb.0:
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX256-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX256-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX256-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -84,17 +85,44 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
 }
 
 define <32 x i8> @testv32i8(<32 x i8> %in) {
-; CHECK-LABEL: testv32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; CHECK-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; CHECK-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; AVX256-LABEL: testv32i8:
+; AVX256:       # %bb.0:
+; AVX256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX256-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX256-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX256-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX256-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX256-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX256-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX256-NEXT:    retq
+;
+; AVX512VL-LABEL: testv32i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512F-LABEL: testv32i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
   %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
   ret <32 x i8> %out
 }
@@ -103,3 +131,5 @@ declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
 declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
 declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 91d4aa6c91dbb..f627560f9f382 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
 ; AVX256BW:       # %bb.0:
 ; AVX256BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX256BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX256BW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX256BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -61,7 +61,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX256BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX256BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX256BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX256BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX256BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX256BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]

diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index faeaef7b40a62..b4b2adba2314f 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -2445,7 +2445,7 @@ define <64 x i8> @test27(<64 x i8> %x) {
 ;
 ; AVX2-LABEL: test27:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
 ; AVX2-NEXT:    vpsubusb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubusb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index cb89a6595ad3b..3207fe99b6850 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX2-LABEL: v16i4:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512F-LABEL: v16i4:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-LABEL: v16i4:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 48a3155cea341..f41d105b6f4f4 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -659,7 +659,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
 ; AVX2-NEXT:    # xmm1 = mem[0,0]
 ; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
 ; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -726,7 +726,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 ;
 ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
@@ -785,7 +785,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ;
 ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
@@ -1267,7 +1267,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
 ;
 ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm1

diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 9b130f1f13841..483c16d6531b4 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -108,8 +108,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    movq 32(%rsi), %rdx
 ; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
 ; CHECK-AVX2-NEXT:    xorl %esi, %esi
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2]
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2,2]
 ; CHECK-AVX2-NEXT:    .p2align 4, 0x90
 ; CHECK-AVX2-NEXT:  .LBB0_1: # %vector.ph
 ; CHECK-AVX2-NEXT:    # =>This Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 52dd300f4efeb..3b9c5a34bcccc 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
@@ -12,22 +12,33 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
 
 define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -38,7 +49,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -135,16 +146,27 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
 ; AVX512F:       # %bb.0:
@@ -184,16 +206,27 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
 ; AVX512F:       # %bb.0:
@@ -233,16 +266,27 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
 ; AVX512F:       # %bb.0:
@@ -489,16 +533,27 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
 ; AVX512F:       # %bb.0:
@@ -538,16 +593,27 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
 ; AVX512F:       # %bb.0:
@@ -587,16 +653,27 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
 ; AVX512F:       # %bb.0:
@@ -636,16 +713,27 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
 ; AVX512F:       # %bb.0:
@@ -685,16 +773,27 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
 ; AVX512F:       # %bb.0:
@@ -734,16 +833,27 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
 ; AVX512F:       # %bb.0:
@@ -783,16 +893,27 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
 ; AVX512F:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index b042ce13bd627..07e1c56a11de4 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -18,18 +18,27 @@
 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
 
 define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
-; AVX-NEXT:    vpand (%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
 ; AVX512F-NEXT:    vpand (%rdi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
@@ -38,7 +47,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind {
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpand (%rdi), %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
@@ -373,16 +382,27 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512F:       # %bb.0:
@@ -427,16 +447,27 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
 }
 
 define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: trunc_v8i32_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_v8i32_to_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v8i32_to_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v8i32_to_v8i8:
 ; AVX512F:       # %bb.0:
@@ -498,7 +529,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -671,7 +702,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -800,7 +831,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -871,7 +903,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
 ;
 ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -999,7 +1032,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1156,16 +1189,27 @@ define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind {
 }
 
 define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_to_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512F:       # %bb.0:
@@ -1210,16 +1254,27 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
 }
 
 define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind {
-; AVX-LABEL: trunc_v4i64_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_v4i64_to_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_v4i64_to_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_v4i64_to_v4i8:
 ; AVX512F:       # %bb.0:
@@ -1281,7 +1336,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX2-LABEL: negative:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1291,7 +1347,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX512F-LABEL: negative:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1310,7 +1367,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ; AVX512BW-LABEL: negative:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1330,7 +1388,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
 ;
 ; AVX512VBMIVL-LABEL: negative:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VBMIVL-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
 ; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX512VBMIVL-NEXT:    vzeroupper

diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 9a1d3ad7733a2..6e357a5fb34f5 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -382,7 +382,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512BW-LABEL: PR34175:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
 ; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
 ; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
@@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512BWVL-LABEL: PR34175:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
 ; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm1
 ; AVX512BWVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
 ; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -401,7 +401,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512VBMI-LABEL: PR34175:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40]
 ; AVX512VBMI-NEXT:    vmovdqu (%rdi), %ymm1
 ; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %ymm2
 ; AVX512VBMI-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
@@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) {
 ;
 ; AVX512VBMIVL-LABEL: PR34175:
 ; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
 ; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm1
 ; AVX512VBMIVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
 ; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero

diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 99024f6bba218..8e330c3bfc676 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -347,7 +347,7 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
 ;
 ; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -407,7 +407,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
 ;
 ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -490,7 +490,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ;
 ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
 ; AVX2-SLOW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]

diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index f52132587c1df..fb4ad4a61426f 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2434,7 +2434,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX2-NEXT:    vpackuswb %ymm3, %ymm4, %ymm3
 ; CHECK-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; CHECK-AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero
 ; CHECK-AVX2-NEXT:    vpackuswb %ymm6, %ymm4, %ymm4
@@ -2471,7 +2471,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL:       # %bb.0:
 ; CHECK-AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4

diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 72a3e74ff0a7f..531297af2a309 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -353,7 +353,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X64-AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1
 ; X64-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; X64-AVX2-NEXT:    vpcmpgtw %xmm0, %xmm3, %xmm0
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
 ; X64-AVX2-NEXT:    vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
 ; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vzeroupper
@@ -622,7 +622,7 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X64-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; X64-AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; X64-AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; X64-AVX2-NEXT:    vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0
 ; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 21f1fd6c8da21..056cb2c28c00d 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX2-LABEL: v16i4:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512F-LABEL: v16i4:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-LABEL: v16i4:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 234259de2ad62..b99e13996d497 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -490,14 +490,32 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; SSE-NEXT:    pminub %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v16i4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: v16i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
 }
@@ -906,7 +924,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm1

diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index e37760d1e0b09..2e4830723f9f2 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -369,14 +369,23 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-NEXT:    por %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; X86-LABEL: test_v2i64:
 ; X86:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index f7878adbd3c95..30441fa4d1f9b 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -238,7 +238,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ;
 ; CHECK-AVX2-LABEL: t3_wide:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
 ; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
 ; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
 ; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1

diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index a49f383e82631..705019bc61ac8 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -489,13 +489,29 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; SSE-NEXT:    psubusb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v16i4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: v16i4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
 }
@@ -817,7 +833,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1112,7 +1128,7 @@ define void @PR48223(ptr %p0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; AVX2-NEXT:    vpsubusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
@@ -1124,7 +1140,7 @@ define void @PR48223(ptr %p0) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; AVX512F-NEXT:    vpsubusw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdi)

diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index edba0caabc15f..020f8ea277a89 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -189,7 +189,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa (%rdi), %xmm0
 ; X64-NEXT:    vmovdqa 16(%rdi), %xmm1
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; X64-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; X64-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]

diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
index cc130fe3427f2..1cff56efba91a 100644
--- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
+++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll
@@ -342,7 +342,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: ge_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -516,7 +516,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: gt_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -764,7 +764,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: le_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -939,7 +939,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: lt_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0

diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 24e05bd937b0c..7b1fd084dee06 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -52,14 +52,23 @@ define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) {
 ; SSE41-NEXT:    cvtpd2ps %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_2i32_to_2f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vcvtpd2ps %xmm0, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_2i32_to_2f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_2i32_to_2f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i32_to_2f32:
 ; AVX512F:       # %bb.0:
@@ -667,13 +676,21 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; SSE41-NEXT:    subpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_2i32_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_2i32_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_2i32_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i32_to_2f64:
 ; AVX512F:       # %bb.0:
@@ -3343,13 +3360,21 @@ define <2 x double> @uitofp_load_2i32_to_2f64(ptr%a) {
 ; SSE41-NEXT:    subpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_2i32_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; VEX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_load_2i32_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_2i32_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
 ; AVX512F:       # %bb.0:
@@ -5663,10 +5688,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
 ; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
 ; AVX2-NEXT:    vpor %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX2-NEXT:    # xmm6 = mem[0,0]
@@ -5691,10 +5716,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
 ; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
 ; AVX512F-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
 ; AVX512F-NEXT:    vpor %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
 ; AVX512F-NEXT:    # xmm6 = mem[0,0]

diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
index 49adfbf5acfd0..76faaca5912e7 100644
--- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll
@@ -71,7 +71,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: max_gt_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -477,7 +477,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: max_ge_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -882,7 +882,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: min_lt_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1290,7 +1290,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: min_le_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2

diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 19c32d786344c..eb8627e89887d 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -1816,7 +1816,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm4
@@ -2546,7 +2546,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm3
 ; AVX2-NEXT:    vpackuswb %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpand %ymm6, %ymm1, %ymm1
 ; AVX2-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
@@ -2666,7 +2666,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm4

diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 32d2332fd3839..3ee92921b070e 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -855,16 +855,27 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE-NEXT:    movdqa %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: uaddo_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: uaddo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uaddo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v2i64:
 ; AVX512:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index e792fb9a8b271..3e8ee21cef40b 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1555,7 +1555,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm5
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
@@ -2216,7 +2216,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
 ; AVX2-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm7
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
@@ -2322,7 +2322,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
 ; AVX512BW-NEXT:    vpmullw %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm5
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]

diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 6f63236206e0c..49d169c896507 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -902,16 +902,27 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: usubo_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: usubo_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: usubo_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v2i64:
 ; AVX512:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 3d98cc95ad05c..ace5b3da1d3f5 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -447,18 +447,44 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_bitreverse_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_v16i8:
 ; XOP:       # %bb.0:
@@ -524,19 +550,47 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_bitreverse_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_v8i16:
 ; XOP:       # %bb.0:
@@ -609,19 +663,47 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_bitreverse_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_v4i32:
 ; XOP:       # %bb.0:
@@ -696,19 +778,47 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: test_bitreverse_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_bitreverse_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_bitreverse_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_bitreverse_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_v2i64:
 ; XOP:       # %bb.0:
@@ -822,26 +932,30 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: test_bitreverse_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_bitreverse_v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    retq
@@ -998,13 +1112,15 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; AVX2-LABEL: test_bitreverse_v16i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
@@ -1012,13 +1128,15 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; AVX512-LABEL: test_bitreverse_v16i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    retq
@@ -1194,13 +1312,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; AVX2-LABEL: test_bitreverse_v8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
@@ -1208,13 +1328,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; AVX512-LABEL: test_bitreverse_v8i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    retq
@@ -1394,13 +1516,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; AVX2-LABEL: test_bitreverse_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
@@ -1408,13 +1532,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; AVX512-LABEL: test_bitreverse_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    retq
@@ -1635,13 +1761,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: test_bitreverse_v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
@@ -1655,16 +1783,18 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-LABEL: test_bitreverse_v64i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
 ; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
@@ -1675,13 +1805,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: test_bitreverse_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -1931,15 +2063,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ;
 ; AVX2-LABEL: test_bitreverse_v32i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
@@ -1954,11 +2089,13 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; AVX512F-LABEL: test_bitreverse_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
@@ -1966,7 +2103,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
@@ -1978,13 +2116,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; AVX512BW-LABEL: test_bitreverse_v32i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -2047,7 +2187,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v32i16:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2058,7 +2199,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
 ; GFNIAVX512F:       # %bb.0:
 ; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -2279,15 +2421,18 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_bitreverse_v16i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
@@ -2302,11 +2447,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512F-LABEL: test_bitreverse_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
@@ -2314,7 +2461,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
@@ -2326,13 +2474,15 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512BW-LABEL: test_bitreverse_v16i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -2395,7 +2545,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v16i32:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2406,7 +2557,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
 ; GFNIAVX512F:       # %bb.0:
 ; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -2635,15 +2787,18 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ;
 ; AVX2-LABEL: test_bitreverse_v8i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
@@ -2658,11 +2813,13 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; AVX512F-LABEL: test_bitreverse_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
@@ -2670,7 +2827,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
@@ -2682,13 +2840,15 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; AVX512BW-LABEL: test_bitreverse_v8i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -2751,7 +2911,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ;
 ; GFNIAVX2-LABEL: test_bitreverse_v8i64:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
@@ -2762,7 +2923,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
 ; GFNIAVX512F:       # %bb.0:
 ; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 2271db9d64038..eaa3790e4357e 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -84,11 +84,17 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: vsel_4xi8:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: vsel_4xi8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
@@ -262,11 +268,17 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: vsel_i8:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: vsel_i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 entry:
   %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
   ret <16 x i8> %vsel
@@ -627,7 +639,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX2-LABEL: constant_pblendvb_avx2:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:

diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index caf4efbbf32c6..2feafb8950111 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -84,7 +84,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
@@ -95,7 +95,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
@@ -117,7 +117,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512BW-LABEL: var_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
@@ -167,7 +167,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; XOPAVX2-LABEL: var_funnnel_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
@@ -547,7 +547,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; AVX512BW-LABEL: var_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
@@ -569,7 +569,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
@@ -583,18 +583,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; AVX512VLVBMI2-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: var_funnnel_v8i16:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
-; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: var_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: var_funnnel_v8i16:
 ; X86-SSE2:       # %bb.0:
@@ -779,7 +792,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -797,7 +810,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ;
 ; AVX512VL-LABEL: var_funnnel_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -871,19 +884,33 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; AVX512VLVBMI2-NEXT:    vzeroupper
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: var_funnnel_v16i8:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOP-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
-; XOP-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: var_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: var_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
@@ -952,20 +979,31 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
@@ -987,7 +1025,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq $1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
@@ -1023,16 +1061,27 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512VLVBMI2-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
 ; X86-SSE2:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 0500d6ec6e1f9..13cbd9520adbe 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -395,7 +395,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ; AVX512BW-LABEL: var_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; AVX512BW-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
@@ -415,7 +415,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
@@ -451,7 +451,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ;
 ; XOPAVX2-LABEL: var_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
@@ -573,11 +573,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm6
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
@@ -607,11 +607,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
@@ -719,7 +719,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
+; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
 ; XOPAVX2-NEXT:    vpaddb %xmm6, %xmm5, %xmm7
 ; XOPAVX2-NEXT:    vpshlb %xmm7, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
@@ -761,7 +761,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
@@ -772,7 +772,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
@@ -794,7 +794,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq $1, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
@@ -849,7 +849,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1

diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 3ffd137921d7d..8cb042c858964 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm3, %zmm2, %zmm4
 ; AVX512BW-NEXT:    vpsrlw $1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
@@ -200,7 +200,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpandnq %zmm3, %zmm2, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlw $1, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
@@ -222,19 +222,19 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm5
 ; AVX512F-NEXT:    vpsrlw $4, %ymm5, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm7
 ; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpxor %ymm3, %ymm8, %ymm9
 ; AVX512F-NEXT:    vpsllw $5, %ymm9, %ymm9
 ; AVX512F-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsrlw $2, %ymm5, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT:    vpand %ymm7, %ymm10, %ymm7
 ; AVX512F-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
 ; AVX512F-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
@@ -260,12 +260,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512F-NEXT:    vpsllw $4, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -291,19 +291,19 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm5
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm7
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpxor %ymm3, %ymm8, %ymm9
 ; AVX512VL-NEXT:    vpsllw $5, %ymm9, %ymm9
 ; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpsrlw $2, %ymm5, %ymm7
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512VL-NEXT:    vpand %ymm7, %ymm10, %ymm7
 ; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
 ; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
@@ -329,12 +329,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -426,7 +426,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq $1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
@@ -448,7 +448,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq $1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1

diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 74b7fa84aac12..08402ab0391dd 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -87,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsllvq %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -414,7 +414,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -425,7 +425,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -737,17 +737,29 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index fc0804b7c92e2..e2fe10bfecd2b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -318,7 +318,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -328,7 +328,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -587,7 +587,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -1316,11 +1316,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ;
 ; AVX512BW-LABEL: constant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -1340,11 +1342,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ;
 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index dd9689676edb9..81874d461fcde 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512VL-LABEL: var_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -143,7 +143,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm8
 ; AVX512F-NEXT:    vpor %ymm4, %ymm8, %ymm4
@@ -186,7 +186,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm4
 ; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512VL-NEXT:    vpternlogq $248, %ymm8, %ymm4, %ymm6
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2

diff  --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 537096e48b066..6fe03f54123c3 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512F-LABEL: constant_funnnel_v2i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512BW-LABEL: constant_funnnel_v2i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper

diff  --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 924de00641efb..5e6d79b059d20 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -84,7 +84,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -95,7 +95,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -117,7 +117,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512BW-LABEL: var_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -168,7 +168,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; XOPAVX2-LABEL: var_funnnel_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -604,7 +604,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; AVX512BW-LABEL: var_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -626,7 +626,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -641,18 +641,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: var_funnnel_v8i16:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
-; XOP-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
-; XOP-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: var_funnnel_v8i16:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i16:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: var_funnnel_v8i16:
 ; X86-SSE2:       # %bb.0:
@@ -825,41 +838,73 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: var_funnnel_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT:    vpsllw $5, %xmm4, %xmm4
-; AVX-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
-; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm6
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; AVX-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlw $2, %xmm1, %xmm4
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm4
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; AVX-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpsllw $5, %xmm2, %xmm2
-; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
-; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: var_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm6
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm2
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT:    vpsllw $5, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm6
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX2-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllw $5, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm4
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -876,7 +921,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ;
 ; AVX512VL-LABEL: var_funnnel_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -945,18 +990,31 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; AVX512VLVBMI2-NEXT:    vzeroupper
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: var_funnnel_v16i8:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
-; XOP-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: var_funnnel_v16i8:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i8:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: var_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
@@ -1041,20 +1099,31 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -1076,7 +1145,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -1113,16 +1182,27 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
-; XOP-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
 ; X86-SSE2:       # %bb.0:
@@ -1376,25 +1456,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; SSE-NEXT:    movdqa %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512F-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
@@ -1407,7 +1500,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
@@ -1420,7 +1513,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 4a580c8bacabe..624e3e8471d20 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -423,7 +423,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ; AVX512BW-LABEL: var_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandn %ymm3, %ymm2, %ymm2
@@ -443,7 +443,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpandn %ymm3, %ymm2, %ymm2
@@ -483,7 +483,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ;
 ; XOPAVX2-LABEL: var_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; XOPAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
@@ -576,7 +576,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ;
 ; AVX2-LABEL: var_funnnel_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; AVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
@@ -608,7 +608,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
@@ -640,7 +640,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
 ; AVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
@@ -752,7 +752,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; XOPAVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
 ; XOPAVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
@@ -792,7 +792,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -803,7 +803,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -825,7 +825,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -881,7 +881,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -1163,7 +1163,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
@@ -1176,7 +1176,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
@@ -1189,7 +1189,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
@@ -1202,7 +1202,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
@@ -1227,7 +1227,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 ; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
@@ -1654,7 +1654,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1676,7 +1676,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1698,7 +1698,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 9630cc7876f6e..40f15af5887be 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
@@ -201,7 +201,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm4
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpandnq %zmm3, %zmm2, %zmm2
@@ -225,16 +225,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpsllw $4, %ymm4, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpxor %ymm7, %ymm3, %ymm8
 ; AVX512F-NEXT:    vpsllw $5, %ymm8, %ymm8
 ; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm9, %ymm6
 ; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
 ; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
@@ -257,17 +257,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512F-NEXT:    vpsrlw $4, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $2, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
@@ -292,16 +292,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
 ; AVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm6
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VL-NEXT:    vpxor %ymm7, %ymm3, %ymm8
 ; AVX512VL-NEXT:    vpsllw $5, %ymm8, %ymm8
 ; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
 ; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
 ; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
@@ -324,17 +324,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
@@ -361,7 +361,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
 ; AVX512BW-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm5, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
@@ -391,7 +391,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 ; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpandq %zmm5, %zmm3, %zmm3
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
@@ -424,7 +424,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -446,7 +446,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
@@ -628,7 +628,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
@@ -651,7 +651,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
 ; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
@@ -672,7 +672,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
@@ -696,7 +696,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
@@ -911,7 +911,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
 ; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -960,7 +960,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
 ; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -984,7 +984,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -1006,7 +1006,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 04e4e66dd1b95..0dab0a466b179 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -87,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsrlvq %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -432,7 +432,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -443,7 +443,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -635,7 +635,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 ; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -764,17 +764,29 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
@@ -1100,25 +1112,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; SSE-NEXT:    packuswb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512F-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
@@ -1131,7 +1156,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
@@ -1144,7 +1169,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index a62b2b70abdb1..755c098309088 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -334,7 +334,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -344,7 +344,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -514,7 +514,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
@@ -530,7 +530,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm3, %ymm4, %ymm3
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -618,7 +618,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsrlq %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -934,7 +934,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
@@ -947,7 +947,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
@@ -960,7 +960,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
@@ -973,7 +973,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
@@ -986,7 +986,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VLBW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
@@ -1367,11 +1367,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ;
 ; AVX512BW-LABEL: constant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -1391,11 +1393,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ;
 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index a1abdd9565af4..1f0cf8b7affa9 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512VL-LABEL: var_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -213,7 +213,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -229,7 +229,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -375,7 +375,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
@@ -397,7 +397,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
@@ -418,7 +418,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
@@ -431,7 +431,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
 ; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index 54acb196f275d..72a1422d2b9e0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512F-LABEL: constant_funnnel_v2i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512F-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512BW-LABEL: constant_funnnel_v2i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512BW-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5]
 ; AVX512VBMI2-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper

diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 53e450856632d..511203ced00a8 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -222,7 +222,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -240,7 +240,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -619,7 +619,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -641,7 +641,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1

diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 279f3c464411c..549c6814d3028 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -201,7 +201,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW:       # %bb.0:
 ; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX2NOBW-NEXT:    vpmulhw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -211,7 +211,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm0, %ymm1
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX2NOBW-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $7, %ymm0, %ymm0
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -228,7 +228,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    vpsrlw $2, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -595,7 +595,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW:       # %bb.0:
 ; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX2NOBW-NEXT:    vpmulhw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -605,7 +605,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $2, %ymm1, %ymm2
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX2NOBW-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $7, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -626,7 +626,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %ymm0, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $2, %ymm1, %ymm2
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpsrlw $7, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2NOBW-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1

diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 04761b6d35c5f..ac974bb51f54f 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -103,7 +103,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
 ; AVX512F-LABEL: test_div7_32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $15, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpsraw $1, %ymm1, %ymm1
@@ -132,7 +132,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX512F-NEXT:    vpmulhw %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -141,12 +141,12 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpackuswb %ymm3, %ymm5, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512F-NEXT:    vpxor %ymm7, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm7, %ymm1, %ymm1
@@ -172,7 +172,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -181,7 +181,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
@@ -221,7 +221,7 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpackuswb %ymm4, %ymm5, %ymm4
 ; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
@@ -411,7 +411,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
 ; AVX512F-LABEL: test_rem7_32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
 ; AVX512F-NEXT:    vpmulhw %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpsrlw $15, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpsraw $1, %ymm3, %ymm3
@@ -448,7 +448,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX512F-NEXT:    vpmulhw %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -457,17 +457,17 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpackuswb %ymm3, %ymm5, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512F-NEXT:    vpxor %ymm3, %ymm8, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsubb %ymm8, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsllw $3, %ymm3, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm9, %ymm5
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
@@ -497,7 +497,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
 ; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -506,7 +506,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
@@ -550,12 +550,12 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
 ; AVX512F-NEXT:    vpackuswb %ymm5, %ymm6, %ymm5
 ; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm7, %ymm7
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -619,7 +619,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1

diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 805dd422ac491..b38de74f85eca 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -202,7 +202,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW:       # %bb.0:
 ; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -624,7 +624,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW:       # %bb.0:
 ; AVX2NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2NOBW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -777,7 +777,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2NOBW-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX2NOBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
 ; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1

diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 5169dd69f39fc..c64344396990a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -103,7 +103,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
 define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
 ; AVX512F-LABEL: test_div7_32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
 ; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsubw %ymm2, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
@@ -135,7 +135,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -144,7 +144,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpackuswb %ymm2, %ymm4, %ymm2
 ; AVX512F-NEXT:    vpsubb %ymm2, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm4, %ymm2
 ; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm2
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -431,7 +431,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
 ; AVX512F-LABEL: test_rem7_32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
 ; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpsubw %ymm3, %ymm1, %ymm4
 ; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm4
@@ -471,7 +471,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX512F-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -480,14 +480,14 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpackuswb %ymm3, %ymm5, %ymm3
 ; AVX512F-NEXT:    vpsubb %ymm3, %ymm1, %ymm5
 ; AVX512F-NEXT:    vpsrlw $1, %ymm5, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm5, %ymm3
 ; AVX512F-NEXT:    vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsllw $3, %ymm3, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
 ; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
 ; AVX512F-NEXT:    vpsubb %ymm7, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
@@ -515,7 +515,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
 ; AVX512BW-NEXT:    vpmullw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
 ; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
@@ -641,7 +641,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm1[0],zmm2[1],zmm1[1],zmm2[2],zmm1[2],zmm2[3],zmm1[3],zmm2[4],zmm1[4],zmm2[5],zmm1[5],zmm2[6],zmm1[6],zmm2[7],zmm1[7],zmm2[16],zmm1[16],zmm2[17],zmm1[17],zmm2[18],zmm1[18],zmm2[19],zmm1[19],zmm2[20],zmm1[20],zmm2[21],zmm1[21],zmm2[22],zmm1[22],zmm2[23],zmm1[23],zmm2[32],zmm1[32],zmm2[33],zmm1[33],zmm2[34],zmm1[34],zmm2[35],zmm1[35],zmm2[36],zmm1[36],zmm2[37],zmm1[37],zmm2[38],zmm1[38],zmm2[39],zmm1[39],zmm2[48],zmm1[48],zmm2[49],zmm1[49],zmm2[50],zmm1[50],zmm2[51],zmm1[51],zmm2[52],zmm1[52],zmm2[53],zmm1[53],zmm2[54],zmm1[54],zmm2[55],zmm1[55]
 ; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 67370c65bc603..6dda7005c8fc0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -264,11 +264,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4:
 ; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,3,6,9,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512BW-SLOW-NEXT:    vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
 ; AVX512BW-SLOW-NEXT:    vpermi2w %xmm2, %xmm1, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
@@ -281,13 +281,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FAST-LABEL: load_i16_stride3_vf4:
 ; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,3,6,9,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9]
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512BW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512BW-FAST-NEXT:    vpermi2w %xmm2, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10]
 ; AVX512BW-FAST-NEXT:    vpermi2w %xmm2, %xmm1, %xmm3
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,5,8,11,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11]
 ; AVX512BW-FAST-NEXT:    vpermi2w %xmm2, %xmm1, %xmm4
 ; AVX512BW-FAST-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512BW-FAST-NEXT:    vmovq %xmm3, (%rdx)

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 2c8d6573cdd14..722065453963e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -856,7 +856,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm4
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm5, %ymm4, %ymm6
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm3, %ymm7
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
@@ -876,7 +876,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm7, %ymm7
@@ -1086,7 +1086,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-FAST-LABEL: load_i16_stride4_vf16:
 ; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
 ; AVX512F-FAST-NEXT:    vmovdqa 96(%rdi), %ymm2
 ; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm1, %ymm3
@@ -1100,7 +1101,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512F-FAST-NEXT:    vpmovqw %zmm4, %xmm9
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
@@ -1903,7 +1904,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm2, %ymm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm7, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqa %ymm4, %ymm8
 ; AVX2-FAST-NEXT:    vpermd %ymm10, %ymm2, %ymm12
@@ -1939,7 +1940,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm11, %xmm10
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm7, %ymm6
 ; AVX2-FAST-NEXT:    vmovdqa %ymm0, %ymm10
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
@@ -1966,7 +1967,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7]
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
@@ -2000,7 +2001,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %ymm11, %ymm8, %ymm8
@@ -2398,7 +2399,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST:       # %bb.0:
 ; AVX512F-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
 ; AVX512F-FAST-NEXT:    vmovdqa 224(%rdi), %ymm6
 ; AVX512F-FAST-NEXT:    vpermd %ymm6, %ymm4, %ymm5
@@ -2422,7 +2424,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpmovqw %zmm0, %xmm13
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm5, %ymm13
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm9, %ymm9
@@ -4116,7 +4118,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm2, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqa %ymm4, %ymm7
 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm2, %ymm4
@@ -4194,7 +4196,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm13
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm8, %ymm8
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
@@ -4262,7 +4264,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm12, %ymm2, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm4, %ymm5
@@ -4356,7 +4358,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
@@ -5221,7 +5223,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm26
 ; AVX512F-FAST-NEXT:    vmovdqa64 (%rdi), %zmm30
 ; AVX512F-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
 ; AVX512F-FAST-NEXT:    vmovdqa64 224(%rdi), %ymm24
 ; AVX512F-FAST-NEXT:    vpermd %ymm24, %ymm1, %ymm10
@@ -5265,7 +5268,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpmovqw %zmm23, %xmm14
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm10, %ymm14
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
 ; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm11, %ymm11

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 6b0d0a9e7662f..888dbf5da722f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -448,22 +448,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-LABEL: load_i16_stride5_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = <1,6,11,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0]
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,5,10,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0]
 ; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpextrw $7, %xmm2, %eax
 ; AVX512BW-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,7,12,17,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17]
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX512BW-NEXT:    vpermi2w %ymm3, %ymm4, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,8,13,18,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18]
 ; AVX512BW-NEXT:    vpermi2w %ymm3, %ymm4, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,9,14,19,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19]
 ; AVX512BW-NEXT:    vpermi2w %ymm3, %ymm4, %ymm6
 ; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rdx)
@@ -1365,9 +1365,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
+; AVX2-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm6
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
@@ -1380,9 +1381,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpermd %ymm9, %ymm10, %ymm9
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,3,6,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
+; AVX2-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm8, %ymm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
@@ -1425,7 +1427,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,0,2,5,7>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7]
+; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -1627,10 +1630,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
+; AVX512F-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX512F-FAST-NEXT:    vpermd %ymm4, %ymm6, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
@@ -1643,9 +1647,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpor %ymm6, %ymm8, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,3,6,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
+; AVX512F-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm4, %ymm8, %ymm9
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
@@ -1688,7 +1693,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,2,5,7>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7]
+; AVX512F-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
@@ -2731,7 +2737,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15]
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
@@ -2846,7 +2853,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm8, %ymm11, %ymm12, %ymm12
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm14, %ymm8
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm15, %ymm8, %ymm8
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7]
@@ -2859,7 +2866,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm9, %ymm10
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm1, %ymm10, %ymm10
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
@@ -2923,7 +2930,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
 ; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm2, %ymm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm6, %ymm9, %ymm9
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
@@ -2949,14 +2956,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm1, %ymm6
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-FAST-NEXT:    vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
@@ -3168,7 +3176,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm4, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FAST-PERLANE-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
@@ -5704,7 +5713,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
@@ -5938,7 +5948,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa 288(%rdi), %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15]
@@ -5968,7 +5978,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
@@ -6077,7 +6087,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0]
 ; AVX2-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm5, %ymm10, %ymm10
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
@@ -6143,12 +6153,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <1,3,u,u,6,0,3,5>
 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm7, %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,2,1,3,0,2,5,7]
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15]
@@ -6604,7 +6615,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
@@ -7097,7 +7109,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
@@ -7295,7 +7307,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vporq %ymm3, %ymm0, %ymm19
 ; AVX512F-FAST-NEXT:    vpsrlq $48, %xmm13, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm13, %xmm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm11
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm11, %xmm9
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
@@ -7366,7 +7378,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm5, %ymm22
 ; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm11, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm11, %xmm30
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3]
@@ -7445,7 +7457,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm30, %xmm7
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm17, %xmm13
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u>
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm16, %ymm9
@@ -7477,7 +7489,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm25
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u>
 ; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 59ff5b4f8f45b..e7ab8a895b2c8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -240,7 +240,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vpbroadcastw 4(%rdi), %xmm4
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,9,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9]
 ; AVX512BW-FAST-NEXT:    vpermi2w %xmm1, %xmm0, %xmm5
 ; AVX512BW-FAST-NEXT:    vpbroadcastw 20(%rdi), %xmm6
 ; AVX512BW-FAST-NEXT:    vpbroadcastw 8(%rdi), %xmm7
@@ -528,19 +528,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i16_stride6_vf4:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,6,12,18,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18]
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,7,13,19,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,8,14,20,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,9,15,21,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,10,16,22,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,11,17,23,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm7
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
@@ -3362,7 +3362,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
@@ -3480,7 +3480,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7]
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -3498,7 +3498,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX2-SLOW-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT:    # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
@@ -3580,7 +3580,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm13, %ymm9
 ; AVX2-FAST-NEXT:    vpblendvb %ymm0, %ymm10, %ymm9, %ymm4
 ; AVX2-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX2-FAST-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX2-FAST-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
@@ -3600,7 +3600,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm13, %xmm4, %xmm1
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm4, %xmm2
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm15, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7]
@@ -3611,7 +3611,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa 320(%rdi), %ymm11
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm6
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm5
@@ -3648,7 +3648,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX2-FAST-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23>
@@ -3692,7 +3692,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm9, %xmm10
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm13
 ; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7]
@@ -3732,7 +3732,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
 ; AVX2-FAST-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm10, %xmm10
 ; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7]
@@ -3758,7 +3758,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7]
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
@@ -3777,7 +3777,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FAST-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
@@ -3858,7 +3858,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm9, %ymm13, %ymm9
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm10, %ymm9, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
@@ -3878,7 +3878,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm13, %xmm4, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm4, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm15, %xmm10, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7]
@@ -3889,7 +3889,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 320(%rdi), %ymm11
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm2, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm2, %xmm5
@@ -3926,7 +3926,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm4, %xmm10, %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23>
@@ -3970,7 +3970,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm9, %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm10, %xmm13
 ; AVX2-FAST-PERLANE-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7]
@@ -4010,7 +4010,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm10, %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7]
@@ -4036,7 +4036,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7]
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm1, %xmm8
@@ -4055,7 +4055,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
@@ -4159,7 +4159,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %ymm10, %ymm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
@@ -4294,7 +4294,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm8, %xmm7, %xmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -4334,7 +4334,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3]
@@ -4387,7 +4387,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm0, %xmm4, %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm4, %xmm3
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
@@ -4396,7 +4396,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm12
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm15
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm3, %xmm5
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm3, %xmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -4446,7 +4446,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm10, %ymm29
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -4573,7 +4573,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm5, %xmm8, %xmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm1, %xmm9
@@ -4608,7 +4608,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm15, %xmm9, %xmm9
@@ -4618,7 +4618,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm6, %xmm6
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
@@ -4725,7 +4725,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-SLOW-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm0, %xmm10, %xmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
@@ -4856,7 +4856,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
 ; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm8, %xmm6, %xmm4
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -4896,7 +4896,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
@@ -4943,7 +4943,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm21
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm5, %xmm3
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm14, %xmm3
@@ -4952,7 +4952,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm13
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm11, %xmm15, %xmm4
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm15, %xmm10
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -5005,7 +5005,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm14, %xmm14
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -5129,7 +5129,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm6
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm6, %xmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
@@ -5164,7 +5164,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
@@ -5174,7 +5174,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm15, %xmm8
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
@@ -7725,7 +7725,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpshufb %xmm15, %xmm4, %xmm3
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
@@ -7968,7 +7968,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm5, %xmm8
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7]
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -8010,7 +8010,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX2-SLOW-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT:    # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
@@ -8166,7 +8166,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
 ; AVX2-FAST-NEXT:    vpblendvb %ymm7, %ymm12, %ymm11, %ymm11
 ; AVX2-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm9, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
@@ -8203,7 +8203,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa %xmm2, %xmm11
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa %xmm3, %xmm8
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
@@ -8220,7 +8220,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm1
@@ -8319,7 +8319,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm0, %xmm15
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7]
@@ -8401,7 +8401,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm0
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, %xmm10
 ; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7]
@@ -8511,7 +8511,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
 ; AVX2-FAST-NEXT:    # xmm9 = mem[0,1,2,3,7,5,6,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7]
@@ -8571,7 +8571,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm5
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm5, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
@@ -8612,7 +8612,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-FAST-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
@@ -8766,7 +8766,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm12, %ymm11, %ymm11
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm11, %xmm9, %xmm12
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
@@ -8803,7 +8803,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm7, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm2, %xmm11
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm3, %xmm8
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
@@ -8820,7 +8820,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm1, %xmm1
@@ -8919,7 +8919,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm0, %xmm15
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7]
@@ -9001,7 +9001,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm1, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm0, %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7]
@@ -9111,7 +9111,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # xmm9 = mem[0,1,2,3,7,5,6,5]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
 ; AVX2-FAST-PERLANE-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7]
@@ -9171,7 +9171,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm7, %xmm5, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
@@ -9212,7 +9212,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
@@ -9425,7 +9425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
@@ -9709,7 +9709,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %xmm4, %xmm22
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm10, %xmm5, %xmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %xmm5, %xmm18
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
@@ -9809,7 +9809,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %xmm18, %xmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %xmm22, %xmm1
@@ -9919,7 +9919,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm2, %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
@@ -9931,7 +9931,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm15, %xmm1
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm15, %xmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
@@ -10053,7 +10053,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm13, %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -10140,7 +10140,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm2, %xmm20
@@ -10254,7 +10254,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, %xmm11, %zmm14, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm20, %xmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm15
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm11, %xmm20
@@ -10330,7 +10330,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm3, %xmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm25
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
@@ -10426,7 +10426,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm12, %xmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
@@ -10440,7 +10440,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm11
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm25, %xmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
@@ -10677,7 +10677,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpternlogq $226, %zmm17, %zmm16, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm12, %xmm14, %xmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm13
@@ -10954,7 +10954,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %xmm4, %xmm19
 ; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm11, %xmm5, %xmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %xmm5, %xmm18
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
@@ -11053,7 +11053,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %xmm18, %xmm0
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %xmm19, %xmm13
@@ -11145,7 +11145,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
@@ -11157,7 +11157,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa 512(%rdi), %ymm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm2, %xmm1
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm23
@@ -11283,7 +11283,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm12, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm7, %xmm7
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
@@ -11371,7 +11371,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm24
@@ -11485,7 +11485,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm11, %zmm14, %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm24, %xmm0
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm15
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm11, %xmm22
@@ -11552,7 +11552,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm31
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
@@ -11647,7 +11647,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm13, %xmm9
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm11, %xmm11
@@ -11660,7 +11660,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm15, %xmm11
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm31, %xmm4
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index e5674bc467e0d..56f269c2441c5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -270,7 +270,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpsrlq $48, %xmm1, %xmm8
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX512BW-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <6,13,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
 ; AVX512BW-FAST-NEXT:    vpermi2w %xmm1, %xmm0, %xmm8
 ; AVX512BW-FAST-NEXT:    vmovd %xmm2, (%rsi)
 ; AVX512BW-FAST-NEXT:    vmovd %xmm4, (%rdx)
@@ -688,21 +688,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,7,14,21,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,8,15,22,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,9,16,23,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,10,17,24,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,11,18,25,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,12,19,26,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = <6,13,20,27,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm8
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
@@ -1389,7 +1389,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm10, %xmm12
 ; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm10
 ; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
@@ -2352,7 +2352,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX2-FAST-NEXT:    vmovdqa 192(%rdi), %ymm7
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm8, %ymm6
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5]
@@ -2453,7 +2453,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,5,1,4,2,5,1,4]
 ; AVX2-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm15, %ymm13
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,0,3,7,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,3,7,0,0,3,7,0]
+; AVX2-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm7, %ymm15, %ymm15
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
@@ -2521,7 +2522,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm9, %ymm7, %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
@@ -2839,153 +2840,307 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512F-FAST-LABEL: load_i16_stride7_vf16:
-; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm10
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
-; AVX512F-FAST-NEXT:    vpermd %zmm1, %zmm2, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
-; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm13, %ymm5
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm12, %xmm14
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpor %ymm3, %ymm12, %ymm3
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa 160(%rdi), %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpor %ymm6, %ymm14, %ymm6
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm14, %ymm14
-; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpor %ymm10, %ymm14, %ymm10
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm15, %xmm7
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpor %ymm7, %ymm8, %ymm7
-; AVX512F-FAST-NEXT:    vpermd %zmm1, %zmm16, %zmm15
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm12
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm14, %ymm9
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm8, %ymm8
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
-; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm9, %ymm9
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm14
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm16, %zmm12
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermd %zmm1, %zmm15, %zmm1
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
-; AVX512F-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm11, %ymm2
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm14, %zmm0
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512F-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa %ymm3, (%rsi)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm6, (%rdx)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm10, (%rcx)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm7, (%r8)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm8, (%r9)
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa %ymm9, (%rax)
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rax)
-; AVX512F-FAST-NEXT:    vzeroupper
-; AVX512F-FAST-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm1, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm9, %ymm13, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm3, %ymm12, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 160(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm6, %ymm14, %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm2, %ymm14, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm10, %ymm14, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm15, %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm1, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm9, %ymm14, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm8, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm2, %ymm9, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm1, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm2, %ymm11, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm3, (%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm8, (%r9)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm9, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm1, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm13, %ymm5
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm12, %ymm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm14, %ymm6
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm2, %ymm14, %ymm14
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpor %ymm10, %ymm14, %ymm10
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm15, %xmm7
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpor %ymm7, %ymm8, %ymm7
+; AVX512DQ-FAST-NEXT:    vpermd %zmm1, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm14, %ymm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm8, %ymm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u>
+; AVX512DQ-FAST-NEXT:    vpermd %ymm2, %ymm9, %ymm9
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm14
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm1, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm2, %ymm11, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm3, (%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm10, (%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm7, (%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm8, (%r9)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm9, (%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i16_stride7_vf16:
 ; AVX512BW:       # %bb.0:
@@ -2995,42 +3150,49 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm5
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
+; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm6
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm9
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm10
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
+; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
@@ -5026,7 +5188,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm14
 ; AVX2-FAST-NEXT:    vmovdqa 416(%rdi), %ymm7
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm7[0,1,0,2]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-FAST-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
@@ -5054,7 +5216,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -5106,7 +5268,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm7[0,1,1,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm8, %ymm4, %ymm11
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
@@ -5180,7 +5342,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm2, %ymm4
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7]
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4]
@@ -5231,12 +5393,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
 ; AVX2-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm1, %ymm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5]
 ; AVX2-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -5426,7 +5588,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm15
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 416(%rdi), %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm7[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm1, %ymm14
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd $31, (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload
@@ -5453,7 +5615,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -5478,7 +5640,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm7[0,1,1,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -5503,7 +5665,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm7[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm7, %ymm3, %ymm9
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
@@ -5544,7 +5706,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
@@ -6090,11 +6252,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST:       # %bb.0:
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm29
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15>
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u>
@@ -6139,7 +6304,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
@@ -6292,7 +6457,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm16, %zmm0, %zmm14
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm0, %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm25, %zmm1, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm25
@@ -6757,13 +6923,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST:       # %bb.0:
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT:    # ymm31 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT:    # ymm19 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm30, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    # ymm18 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm20
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15>
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
@@ -6808,7 +6977,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm13
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm25
@@ -11195,7 +11364,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa 192(%rdi), %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm5, %ymm4
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FAST-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -11249,7 +11418,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm15, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -11364,7 +11533,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -11416,7 +11585,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -11539,13 +11708,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vmovdqa %ymm1, %ymm3
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm11, %ymm5
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
 ; AVX2-FAST-NEXT:    vpshufb %ymm1, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
@@ -11617,12 +11786,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0]
 ; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm0, %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5]
 ; AVX2-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm9, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
 ; AVX2-FAST-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -12003,7 +12172,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm14, %ymm12, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -12067,7 +12236,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -12119,7 +12288,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm6[0,1,1,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -12178,7 +12347,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm4 = ymm6[0,1,1,3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -12245,7 +12414,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
@@ -13048,7 +13217,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %ymm2, %ymm14
 ; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
@@ -13581,7 +13750,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm9, %ymm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
@@ -13703,11 +13872,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm10, %ymm3, %ymm6
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm6, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -13738,7 +13907,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastw 232(%rdi), %xmm6
@@ -13749,7 +13918,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm21, %ymm2
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm19, %zmm25, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
@@ -13775,7 +13945,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm1, %zmm8, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
@@ -13792,7 +13963,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm27
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm3, %xmm7
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm0, %xmm16
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm29
@@ -13825,7 +13996,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm1, %zmm11, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm4
@@ -13839,7 +14011,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm3, %zmm8, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13864,7 +14036,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm1, %ymm5
 ; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm7, %xmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15>
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm16, %zmm15, %zmm0
@@ -13873,7 +14045,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm14, %zmm11, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm7, %ymm7
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14655,7 +14827,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
 ; AVX512DQ-SLOW-NEXT:    vpshufb %ymm7, %ymm12, %ymm12
@@ -15182,7 +15354,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm9, %ymm7
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7]
@@ -15305,11 +15477,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm12
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm10, %ymm3, %ymm14
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm14, %zmm21
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7]
@@ -15338,7 +15510,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-FAST-NEXT:    vpbroadcastw 232(%rdi), %xmm6
@@ -15349,7 +15521,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm1, %ymm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm19, %zmm1, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
@@ -15376,7 +15549,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm6, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
@@ -15393,7 +15567,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm5, %ymm20
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14>
@@ -15423,7 +15597,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm23, %ymm12, %ymm2
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm10, %zmm0
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
@@ -15434,7 +15609,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm0
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15458,7 +15633,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa %ymm2, %ymm5
 ; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm6, %xmm14
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15>
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm17, %zmm7, %zmm15
@@ -15467,7 +15642,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm16, %zmm10, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29>
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm15, %ymm15
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15496,7 +15671,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm14, %ymm6, %ymm6
 ; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %zmm16, %zmm5, %zmm6
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm6, %ymm6
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index 9c533a00de734..7032733e67c49 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,17,25,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,10,18,26,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,11,19,27,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,12,20,28,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,13,21,29,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = <6,14,22,30,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm8
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = <7,15,23,31,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31]
 ; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm1, %ymm9
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
@@ -541,7 +541,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa 80(%rdi), %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm4
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
@@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
 ; AVX512F-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm13, %xmm15, %xmm14
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
 ; AVX512F-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -602,7 +602,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa 80(%rdi), %xmm5
 ; AVX512F-FAST-NEXT:    vmovdqa 64(%rdi), %xmm6
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm7, %xmm0
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm2, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm8
@@ -614,7 +614,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm13, %xmm15
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm12, %xmm14, %xmm15
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
@@ -622,11 +622,11 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm0, %xmm17
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm7, %xmm15
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm0, %xmm15
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <3,7,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm12, %xmm15, %xmm13
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3]
 ; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
@@ -1285,7 +1285,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm1, %xmm26
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm0, %xmm27
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3]
@@ -1350,7 +1350,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm14, %xmm17, %xmm15
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3]
 ; AVX512F-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1453,7 +1453,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm1, %xmm28
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm11, %xmm1
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm14, %xmm7, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
@@ -1487,7 +1487,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm12, %xmm1
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm13, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1]
@@ -1503,7 +1503,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm25
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm11, %xmm0
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm14, %xmm11, %xmm0
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3]
@@ -1520,7 +1520,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm21
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm19 = <3,7,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm19, %xmm12
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
 ; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1621,42 +1621,50 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm5
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm5
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm6
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm6
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm11
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
@@ -3071,7 +3079,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
@@ -3231,7 +3239,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm1, %zmm1
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm9, %xmm8, %xmm5
 ; AVX512F-SLOW-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX512F-SLOW-NEXT:    # xmm1 = xmm5[0,1],mem[2,3]
@@ -3269,7 +3277,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm1, %xmm4
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm0, %xmm2, %xmm4
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
@@ -3397,7 +3405,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm18, %xmm12, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm31, %xmm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
@@ -3454,7 +3462,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa 320(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm18
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm5, %xmm1, %xmm0
@@ -3549,7 +3557,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm21, %xmm0
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm20, %xmm1, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm1, %xmm13
@@ -3581,7 +3589,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm18, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm5, %xmm0, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
@@ -3617,7 +3625,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm29 = <3,7,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm20, %xmm29, %xmm1
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
@@ -3655,7 +3663,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512F-FAST-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm6, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm0, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm4, %xmm20
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -3719,7 +3727,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm28
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm3, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm30, %xmm8, %xmm0
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -3747,7 +3755,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm26
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm20, %xmm0, %xmm4
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
@@ -3844,7 +3852,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movb $-64, %dil
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm9
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -3856,7 +3865,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm10
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -3868,7 +3878,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm10, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm11
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -3880,7 +3891,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm11, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -3892,7 +3904,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm12, %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm13
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -3904,7 +3917,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm14
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -3916,7 +3930,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm14, %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm15
 ; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -3926,7 +3941,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm15, %zmm6
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm15, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
 ; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm15, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -6894,7 +6910,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
@@ -7288,7 +7304,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm0
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm20, %xmm16, %xmm0
 ; AVX512F-SLOW-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -7363,7 +7379,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm6, %xmm0
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,4>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4]
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm5, %xmm4, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
@@ -7691,7 +7707,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm21, %xmm1
 ; AVX512F-SLOW-NEXT:    vpermt2d %xmm20, %xmm16, %xmm1
 ; AVX512F-SLOW-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7799,7 +7815,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa 320(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm3, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
@@ -8034,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm8
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm22, %xmm8
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm30, %xmm11, %xmm8
@@ -8105,7 +8121,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm20, %xmm0, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm0, %xmm4
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3]
@@ -8208,7 +8224,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm18 = <3,7,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
 ; AVX512F-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload
 ; AVX512F-FAST-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -8281,7 +8297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX512F-FAST-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm9, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,0,4>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm4, %xmm5, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm4, %xmm16
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
@@ -8464,7 +8480,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm13, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm17, %xmm15, %xmm0
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -8530,7 +8546,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm9, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,2,6>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm16, %xmm0, %xmm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm17, %xmm27
 ; AVX512F-FAST-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
@@ -8635,7 +8651,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm24 = <3,7,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7]
 ; AVX512F-FAST-NEXT:    vpermt2d %xmm27, %xmm24, %xmm1
 ; AVX512F-FAST-NEXT:    vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX512F-FAST-NEXT:    # xmm1 = xmm1[0,1],mem[2,3]
@@ -8729,282 +8745,1149 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-LABEL: load_i16_stride8_vf64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56>
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm4
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm9, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm6, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm5, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
-; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm17, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm7
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm4, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm27
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm10, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm12, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm8
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm17, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm10, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm17, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm31
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm31
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm1, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm12, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm14, %zmm23
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm5, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm2, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm24
-; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm10, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm12, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm26
-; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm14, %zmm26
-; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm17, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm12
-; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57>
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59>
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60>
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61>
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62>
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63>
-; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT:    addq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-SLOW-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    movb $-64, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm7, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm22, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm9, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm2, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm7, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm19, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm15, %zmm17, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm17, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm1, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm10, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm12, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm14, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm22, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm22, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm0, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm0, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm0, %zmm12, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm0, %zmm14, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm0, %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm11, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm11, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512DQBW-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512DQBW-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm29, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512DQBW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm5, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
+; AVX512DQBW-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm11, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
+; AVX512DQBW-FAST-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm11, %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermi2w %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2w %zmm31, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <512 x i16>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
   %strided.vec1 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -9032,13 +9915,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW: {{.*}}
 ; AVX512: {{.*}}
 ; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F: {{.*}}
 ; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index a6e64e1d8f6d0..7d3f515e259e9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,0,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
 ; AVX512F-FAST-NEXT:    vpbroadcastd 8(%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -106,7 +106,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,0,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
 ; AVX512BW-FAST-NEXT:    vpbroadcastd 8(%rdi), %xmm0
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 8b1aae61ed5c3..25254def1809e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-FAST-NEXT:    vmovq %xmm2, (%rsi)
@@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-FAST-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512BW-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FAST-NEXT:    vmovq %xmm2, (%rsi)

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index d28215f89fdc1..bcffe374f4b87 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -145,7 +145,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
 ; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm3, %xmm1, %xmm5
 ; AVX512F-FAST-NEXT:    vmovddup {{.*#+}} xmm1 = [4,2,4,2]
 ; AVX512F-FAST-NEXT:    # xmm1 = mem[0,0]
@@ -207,7 +207,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm2, %xmm1, %xmm4
 ; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm3, %xmm1, %xmm2
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm3, %xmm1, %xmm5
 ; AVX512BW-FAST-NEXT:    vmovddup {{.*#+}} xmm1 = [4,2,4,2]
 ; AVX512BW-FAST-NEXT:    # xmm1 = mem[0,0]
@@ -367,13 +367,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
 ; AVX2-ONLY-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <4,2,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vpermd %ymm1, %ymm9, %ymm2
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
 ; AVX2-ONLY-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,3,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3]
 ; AVX2-ONLY-NEXT:    vpermd %ymm1, %ymm7, %ymm1
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rsi)
@@ -901,12 +901,14 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11]
 ; AVX512-NEXT:    vpermi2d %ymm4, %ymm5, %ymm7
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,0,6,12>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %ymm1, %ymm0, %ymm4
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = <2,8,14,20,26,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,1,7,13>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13]
+; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %ymm1, %ymm0, %ymm5
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index f9713d1eab16c..a8b737fc81732 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -133,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,11,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm5
 ; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512F-SLOW-NEXT:    vpermi2d %ymm5, %ymm6, %ymm1
@@ -164,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512F-FAST-NEXT:    vpbroadcastd 8(%rdi), %xmm4
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,2,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,11,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512F-FAST-NEXT:    vpermi2d %ymm1, %ymm6, %ymm0
@@ -201,7 +201,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,11,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
 ; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm5
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512BW-SLOW-NEXT:    vpermi2d %ymm5, %ymm6, %ymm1
@@ -232,9 +232,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512BW-FAST-NEXT:    vpbroadcastd 8(%rdi), %xmm4
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,2,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm0, %xmm1, %xmm5
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,11,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512BW-FAST-NEXT:    vpermi2d %ymm1, %ymm6, %ymm0
@@ -911,7 +911,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpbroadcastd 100(%rdi), %xmm10
 ; AVX2-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm11
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -986,7 +986,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
 ; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6]
 ; AVX2-FAST-NEXT:    vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <1,0,7,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm8, %ymm11, %ymm11
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
@@ -1014,7 +1015,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpbroadcastd 100(%rdi), %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa 64(%rdi), %xmm11
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1025,7 +1026,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,0,7,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm5, %ymm4
 ; AVX2-FAST-NEXT:    vpbroadcastd 216(%rdi), %ymm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
@@ -1117,7 +1119,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd 100(%rdi), %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 64(%rdi), %xmm11
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1169,37 +1171,44 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm2
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,3,10,17>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,4,11,18>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18]
+; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,5,12,19>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19]
+; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm7
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,6,13,20>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20]
+; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u>
 ; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,0,7,14,21>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21]
+; AVX512-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,11,18,25]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,1,8,15,22>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22]
+; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [5,12,19,26]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,2,9,16,23>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23]
+; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,13,20,27]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
@@ -1923,7 +1932,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT:    vpbroadcastd 100(%rdi), %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm3
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm7, %ymm12
 ; AVX2-SLOW-NEXT:    vpermd %ymm10, %ymm5, %ymm10
@@ -2149,7 +2158,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpbroadcastd 100(%rdi), %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqa %ymm6, %ymm8
 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm3, %ymm4
@@ -2376,7 +2385,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd 100(%rdi), %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 64(%rdi), %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm7, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpermd %ymm10, %ymm5, %ymm10
@@ -4815,7 +4824,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT:    vpbroadcastd 100(%rdi), %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa 64(%rdi), %xmm2
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,3,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 5f6de8f4b8bf4..b161f24e01f2f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
 ; AVX512F-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
@@ -220,7 +220,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX512BW-FAST-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
 ; AVX512BW-FAST-NEXT:    vpermi2d %xmm1, %xmm0, %xmm3
 ; AVX512BW-FAST-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdi), %ymm1
@@ -817,42 +817,50 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm2
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,8,16,24>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,16,24]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm5
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,9,17,25>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25]
+; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm5
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,9,17,25]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm6
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,2,10,18,26>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26]
+; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,10,18,26]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,3,11,19,27>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27]
+; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm7
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,11,19,27]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,4,12,20,28>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28]
+; AVX512-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,12,20,28]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,5,13,21,29>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
+; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [5,13,21,29]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,6,14,22,30>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30]
+; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,14,22,30]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,7,15,23,31>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31]
+; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm11
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,15,23,31]
 ; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
@@ -1694,7 +1702,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    movb $-64, %dil
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,0,8,16,24>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
+; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm9
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -1706,7 +1715,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,9,17,25>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
+; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm10
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -1718,7 +1728,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
 ; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,2,10,18,26>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm11
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1730,7 +1741,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
 ; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,3,11,19,27>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
+; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm12
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -1742,7 +1754,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,4,12,20,28>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
+; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm13
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1754,7 +1767,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm15
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
 ; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,5,13,21,29>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
+; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm14
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm13
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -1766,7 +1780,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm16
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,6,14,22,30>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
+; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm15
 ; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -1776,7 +1791,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm15, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,7,15,23,31>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm5
 ; AVX512F-NEXT:    vpermt2d %zmm1, %zmm15, %zmm0
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -1814,7 +1830,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movb $-64, %dil
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,0,8,16,24>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm9
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
@@ -1826,7 +1843,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,9,17,25>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm10
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
@@ -1838,7 +1856,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,2,10,18,26>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm11
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1850,7 +1869,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,3,11,19,27>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
+; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm12
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
@@ -1862,7 +1882,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,4,12,20,28>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm13
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1874,7 +1895,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,5,13,21,29>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm14
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
@@ -1886,7 +1908,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,6,14,22,30>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm15
 ; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
@@ -1896,7 +1919,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,7,15,23,31>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm5
 ; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm15, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
@@ -3727,559 +3751,2293 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: load_i32_stride8_vf32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $1032, %rsp # imm = 0x408
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,8,16,24>
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512F-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm7
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm27
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm31
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512F-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,2,10,18,26>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,3,11,19,27>
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,4,12,20,28>
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,6,14,22,30>
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,7,15,23,31>
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512F-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512F-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm7, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm7, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm7, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 64(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512F-NEXT:    addq $1032, %rsp # imm = 0x408
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: load_i32_stride8_vf32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,0,8,16,24>
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm7
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm27
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm31
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512BW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,2,10,18,26>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,3,11,19,27>
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,4,12,20,28>
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,6,14,22,30>
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,7,15,23,31>
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm11
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm7, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT:    addq $1032, %rsp # imm = 0x408
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    movb $-64, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQ-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    movb $-64, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    movb $-64, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm9, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm7, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm17, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm17, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm1, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm12, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm14, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm12, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm14, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm11, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm5, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm11, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT:    # ymm18 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm11, %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm25, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm30, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, 64(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm7, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, (%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 64(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <256 x i32>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
   %strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
@@ -8145,1193 +9903,4829 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: load_i32_stride8_vf64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $3304, %rsp # imm = 0xCE8
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm27
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,8,16,24>
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512F-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,1,9,17,25>
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,2,10,18,26>
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm31
-; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,3,11,19,27>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm27
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,12,20,28>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512F-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm28
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm22
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm29
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512F-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm20
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,6,14,22,30>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,7,15,23,31>
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512F-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 192(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, (%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 192(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, (%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 64(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 192(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, (%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 128(%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT:    addq $3304, %rsp # imm = 0xCE8
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: load_i32_stride8_vf64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $3304, %rsp # imm = 0xCE8
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,8,16,24>
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,1,9,17,25>
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,2,10,18,26>
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,3,11,19,27>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm27
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,12,20,28>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm28
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm22
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm29
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512BW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm20
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,5,13,21,29>
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,6,14,22,30>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,7,15,23,31>
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 192(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, (%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 192(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, (%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 192(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, (%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 128(%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT:    addq $3304, %rsp # imm = 0xCE8
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512F-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512F-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    movb $-64, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    movb $-64, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-FAST-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm3, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    movb $-64, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
+; AVX512DQBW-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm5, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm8, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm4, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm16, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm7, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm9, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm14, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm3, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm18, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm13, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm4, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm15, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm21, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm31, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm13, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm4, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm23, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm14, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm0, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm21, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm6, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm18, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm16, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm19, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm30, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm16
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm24, %zmm25, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm28, %zmm25, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm0, %zmm25, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm9, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm9, %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm9, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm5, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm29, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm5, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm2, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm12, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm12, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm27, %zmm25, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm20, %zmm25, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm2, %zmm25, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm29
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm2, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm2, %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm26, %zmm16, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermi2d %zmm22, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm22, %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2d %zmm1, %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm30, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 192(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 64(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 192(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 64(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 192(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, (%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%r9)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <512 x i32>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
   %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -9358,14 +14752,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST: {{.*}}
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}
 ; FALLBACK10: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 3eca48fbddbce..f14a54a8e93ca 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -279,7 +279,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <11,1,7,u>
 ; AVX512F-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm8
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,0,6>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6]
+; AVX512F-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermi2q %ymm7, %ymm4, %ymm8
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,10]
 ; AVX512F-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
@@ -362,7 +363,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <11,1,7,u>
 ; AVX512BW-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,0,6>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6]
+; AVX512BW-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpermi2q %ymm7, %ymm4, %ymm8
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,10]
 ; AVX512BW-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 49a2bfaeb0539..d47558166f061 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -332,7 +332,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = [5,12]
 ; AVX512F-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512F-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,13]
 ; AVX512F-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
@@ -362,7 +363,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512F-FAST-NEXT:    vpbroadcastq 176(%rdi), %ymm2
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,0,7>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7]
+; AVX512F-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
 ; AVX512F-FAST-NEXT:    vpermi2q 160(%rdi), %ymm6, %ymm2
 ; AVX512F-FAST-NEXT:    vmovdqa 16(%rdi), %xmm7
@@ -384,7 +386,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = [5,12]
 ; AVX512F-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512F-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,13]
 ; AVX512F-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
@@ -437,7 +440,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = [5,12]
 ; AVX512BW-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512BW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512BW-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-SLOW-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,13]
 ; AVX512BW-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
@@ -467,7 +471,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512BW-FAST-NEXT:    vpbroadcastq 176(%rdi), %ymm2
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,0,7>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
 ; AVX512BW-FAST-NEXT:    vpermi2q 160(%rdi), %ymm6, %ymm2
 ; AVX512BW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm7
@@ -489,7 +494,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = [5,12]
 ; AVX512BW-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,11>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11]
+; AVX512BW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpermi2q 192(%rdi), %zmm3, %zmm9
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,13]
 ; AVX512BW-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
@@ -900,7 +906,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,11]
 ; AVX512F-NEXT:    vpermi2q %zmm2, %zmm5, %zmm15
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,4,11>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11]
+; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm6, %zmm7, %zmm15
 ; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm7
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
@@ -1009,7 +1016,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,11]
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm5, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,4,11>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm14
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm7
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
@@ -1976,7 +1984,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermt2q %zmm14, %zmm8, %zmm9
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,4,11>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermt2q %zmm27, %zmm11, %zmm10
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
 ; AVX512F-NEXT:    vpermt2q %zmm26, %zmm12, %zmm1
@@ -2161,7 +2170,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm8, %zmm9
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,4,11>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm11, %zmm10
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
 ; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm1
@@ -3776,880 +3786,3513 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: load_i64_stride7_vf32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2152, %rsp # imm = 0x868
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm19
-; AVX512F-NEXT:    vmovaps 1024(%rdi), %zmm1
-; AVX512F-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512F-NEXT:    vmovaps 576(%rdi), %zmm3
-; AVX512F-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm3
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm4, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm22, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm22, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm4, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm30
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm26, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm26, %zmm23
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm20, %zmm15
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm20, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm29
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm0, %zmm18
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm22, %zmm21
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm0, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm4, %zmm31
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm26, %zmm11
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm0, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm20, %zmm25
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm17, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm28
-; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm20
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    movb $24, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm17
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm18, %zmm22
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <u,u,4,11>
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm18, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm24, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm5, %zmm15
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm13, %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm18, %zmm19
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm13, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm24, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm24, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 912(%rdi), %xmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
-; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm27, %zmm27
-; AVX512F-NEXT:    vmovdqa 1360(%rdi), %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
-; AVX512F-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm8
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm6
-; AVX512F-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm8
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512F-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm23, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
-; AVX512F-NEXT:    vmovdqa 1472(%rdi), %ymm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm26, %zmm23
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,13]
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm26, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm26, %zmm2
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm3, %zmm26
-; AVX512F-NEXT:    vmovdqa 960(%rdi), %ymm11
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm11, %xmm11
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm13
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm8, %zmm12
-; AVX512F-NEXT:    movb $-32, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm30 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm24 {%k2}
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm19, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k2}
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm13, %zmm22, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
-; AVX512F-NEXT:    vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm10, %zmm20, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm25, %zmm8
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm11
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa 1408(%rdi), %ymm13
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm13, %zmm18, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k2}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm11
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm11
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm11
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm5
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 192(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512F-NEXT:    vmovaps %zmm9, 64(%rax)
-; AVX512F-NEXT:    addq $2152, %rsp # imm = 0x868
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 1024(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 576(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    movb $24, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 960(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    movb $-32, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm9, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: load_i64_stride7_vf32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2120, %rsp # imm = 0x848
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovaps 576(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm19, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm28
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm23
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm25
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm31
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm15, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm28
-; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    movb $24, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm17
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
-; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm16, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <u,u,4,11>
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm13, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm22
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm13, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm24, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 912(%rdi), %xmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
-; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm9, %zmm27
-; AVX512BW-NEXT:    vmovdqa 1360(%rdi), %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
-; AVX512BW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm6
-; AVX512BW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 1472(%rdi), %ymm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm26, %zmm23
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm20, %zmm18
-; AVX512BW-NEXT:    movb $-32, %al
-; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm30 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512BW-NEXT:    vmovdqa 960(%rdi), %ymm2
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm22, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2 {%k2}
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm4, %zmm19, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm10
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm10, %zmm21, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512BW-NEXT:    vmovdqa 1408(%rdi), %ymm13
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm13, %zmm16, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %ymm16
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm8
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512BW-NEXT:    vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 192(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT:    vmovaps %zmm8, 64(%rax)
-; AVX512BW-NEXT:    addq $2120, %rsp # imm = 0x848
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 1024(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 576(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movb $24, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512F-ONLY-FAST-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 960(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    movb $-32, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm9, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $2152, %rsp # imm = 0x868
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $2152, %rsp # imm = 0x868
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovaps 1024(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovaps 576(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    movb $24, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQ-SLOW-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1024(%rdi), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa 960(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512DQ-SLOW-NEXT:    movb $-32, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1536(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm9, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    addq $2152, %rsp # imm = 0x868
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $2152, %rsp # imm = 0x868
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovaps 1024(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovaps 576(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm15
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm22, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm0, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm17, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    movb $24, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm17
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQ-FAST-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm13, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm24, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm27, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 1024(%rdi), %ymm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm23, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [6,13]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa 960(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm12
+; AVX512DQ-FAST-NEXT:    movb $-32, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm30 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm11, %zmm19, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm22, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm10, %zmm20, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm25, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm18, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 1536(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm11
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 128(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm9, 64(%rax)
+; AVX512DQ-FAST-NEXT:    addq $2152, %rsp # imm = 0x868
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 576(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-32, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 576(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    movb $24, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    movb $-32, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovaps 576(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512DQBW-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    movb $24, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQBW-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQBW-SLOW-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1024(%rdi), %ymm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512DQBW-SLOW-NEXT:    movb $-32, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %ymm16
+; AVX512DQBW-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1536(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovaps 576(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
+; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm19, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm19, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
+; AVX512DQBW-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm29, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm31, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm4, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    movb $24, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <0,7,14,u>
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm17
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm28
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQBW-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm16, %zmm19
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11]
+; AVX512DQBW-FAST-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm24, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm13, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm13, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm24, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm24, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 912(%rdi), %xmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa 464(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm28, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1360(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1024(%rdi), %ymm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 576(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1472(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u>
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm26, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm7, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm20, %zmm18
+; AVX512DQBW-FAST-NEXT:    movb $-32, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm30 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 960(%rdi), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm2, %zmm22, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 512(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm4, %zmm19, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm10
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm10, %zmm21, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1408(%rdi), %ymm13
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $0, %xmm13, %zmm16, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %ymm16
+; AVX512DQBW-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1536(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 192(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 64(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 192(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, (%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 128(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 192(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 64(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <224 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
   %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
@@ -8696,7 +11339,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,4,11>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermt2q %zmm6, %zmm3, %zmm11
 ; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm29 # 64-byte Reload
@@ -9677,7 +12321,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,4,11>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
@@ -10073,14 +12718,6 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST: {{.*}}
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}
 ; FALLBACK10: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index e7067d4f6bd76..439265452cc47 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -371,22 +371,26 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,4,12>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12]
+; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm2
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,12]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm11
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm11
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [5,13]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm12
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,6,14>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14]
+; AVX512-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm12
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,14]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm13
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,7,15>
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15]
+; AVX512-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm13
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,15]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm5
@@ -914,7 +918,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vpermt2q %zmm6, %zmm14, %zmm9
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
@@ -922,7 +927,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm16
 ; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,5,13>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
@@ -945,12 +951,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm6
 ; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm19, %zmm9
 ; AVX512F-NEXT:    vpermt2q %zmm4, %zmm20, %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
@@ -1028,7 +1036,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm14, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
@@ -1036,7 +1045,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm16
 ; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,5,13>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
@@ -1059,12 +1069,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm6
 ; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm19, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm20, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
@@ -2123,7 +2135,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
 ; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
 ; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm14
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
@@ -2141,7 +2154,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
 ; AVX512F-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
@@ -2186,10 +2200,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpermt2q %zmm27, %zmm1, %zmm3
 ; AVX512F-NEXT:    vpermi2q %zmm30, %zmm6, %zmm5
 ; AVX512F-NEXT:    vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
 ; AVX512F-NEXT:    vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm13
 ; AVX512F-NEXT:    vpermi2q %zmm16, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm16, %zmm15, %zmm0
@@ -2349,7 +2365,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
 ; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
@@ -2367,7 +2384,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
@@ -2412,10 +2430,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm3
 ; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm13
 ; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm15, %zmm0
@@ -4619,7 +4639,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm14
 ; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -4717,7 +4738,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
 ; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,5,13>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15
 ; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm15
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
@@ -4827,12 +4849,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,6,14>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
@@ -5151,7 +5175,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm14
 ; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -5249,7 +5274,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
 ; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,5,13>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm15
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
@@ -5359,12 +5385,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,6,14>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
@@ -9640,2267 +9668,9085 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: load_i64_stride8_vf64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512F-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512F-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512F-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512F-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512F-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512F-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512F-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,4,12>
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,5,13>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,7,15>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512F-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512F-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512F-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512F-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512F-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512F-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512F-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512F-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512F-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512F-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512F-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512F-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: load_i64_stride8_vf64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512BW-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512BW-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512BW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,4,12>
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,5,13>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,7,15>
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512BW-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512BW-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512BW-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512BW-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
+;
+; AVX512F-ONLY-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    movb $-64, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    movb $-64, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQ-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQ-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i64_stride8_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i64_stride8_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    movb $-64, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <512 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
   %strided.vec1 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -11928,16 +18774,8 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
 ; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
 ; AVX512F-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 23f26672fe7d0..84f695245dbb1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -174,31 +174,47 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; SSE-NEXT:    movdqa %xmm0, (%rdx)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_i8_stride2_vf16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdx)
-; AVX1-NEXT:    retq
+; AVX1-ONLY-LABEL: load_i8_stride2_vf16:
+; AVX1-ONLY:       # %bb.0:
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT:    vpand %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-ONLY-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX1-ONLY-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX1-ONLY-NEXT:    retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride2_vf16:
+; AVX2-ONLY:       # %bb.0:
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vpand %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX2-ONLY-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX2-ONLY-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_i8_stride2_vf16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512F-NEXT:    vpand %xmm0, %xmm2, %xmm3
 ; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -450,9 +466,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovdqa 64(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovdqa 96(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm4, %ymm3, %ymm5
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3]
@@ -460,9 +476,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX2-ONLY-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm7 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
@@ -479,13 +495,13 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ;
 ; AVX512F-SLOW-LABEL: load_i8_stride2_vf64:
 ; AVX512F-SLOW:       # %bb.0:
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512F-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm3
 ; AVX512F-SLOW-NEXT:    vmovdqa 96(%rdi), %ymm4
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
@@ -493,9 +509,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
@@ -510,7 +526,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ;
 ; AVX512F-FAST-LABEL: load_i8_stride2_vf64:
 ; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512F-FAST-NEXT:    vmovdqa 64(%rdi), %ymm3
@@ -518,17 +534,17 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm3, %ymm6
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
 ; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
 ; AVX512F-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm4
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index 21b6e38f3f09a..f0118bc3b33b6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -680,7 +680,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
@@ -688,7 +689,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-ONLY-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
@@ -706,7 +708,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
@@ -733,7 +736,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
@@ -1369,7 +1373,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
 ; AVX2-ONLY-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm5, %ymm6, %ymm1, %ymm7
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-ONLY-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm5
 ; AVX2-ONLY-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
@@ -1378,7 +1383,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm8, %ymm1, %ymm0, %ymm9
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
+; AVX2-ONLY-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm8, %ymm4, %ymm3, %ymm8
 ; AVX2-ONLY-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
@@ -1393,7 +1399,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vpor %ymm1, %ymm4, %ymm1
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
 ; AVX2-ONLY-NEXT:    # ymm6 = mem[0,1,0,1]
@@ -1424,7 +1430,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm5
 ; AVX512F-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
@@ -1456,41 +1463,153 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-LABEL: load_i8_stride3_vf64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm4
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm5
-; AVX512BW-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
-; AVX512BW-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
-; AVX512BW-NEXT:    kmovq %rax, %k1
-; AVX512BW-NEXT:    vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride3_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 112(%rdi), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: load_i8_stride3_vf64:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 112(%rdi), %xmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512BW-ONLY-FAST-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512BW-ONLY-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: load_i8_stride3_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 112(%rdi), %xmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT:    vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-SLOW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: load_i8_stride3_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa 112(%rdi), %xmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512DQBW-FAST-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-FAST-NEXT:    vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512DQBW-FAST-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, (%rdx)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, (%rcx)
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %wide.vec = load <192 x i8>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
   %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
@@ -1507,13 +1626,9 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW: {{.*}}
 ; AVX512: {{.*}}
 ; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
 ; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 7e9cce1a7e8db..f42d49cbeb73d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -216,31 +216,57 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movq %xmm2, (%r8)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_i8_stride4_vf8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    vmovq %xmm3, (%rdx)
-; AVX1-NEXT:    vmovq %xmm4, (%rcx)
-; AVX1-NEXT:    vmovq %xmm1, (%r8)
-; AVX1-NEXT:    retq
+; AVX1-ONLY-LABEL: load_i8_stride4_vf8:
+; AVX1-ONLY:       # %bb.0:
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-ONLY-NEXT:    vmovq %xmm0, (%rsi)
+; AVX1-ONLY-NEXT:    vmovq %xmm3, (%rdx)
+; AVX1-ONLY-NEXT:    vmovq %xmm4, (%rcx)
+; AVX1-ONLY-NEXT:    vmovq %xmm1, (%r8)
+; AVX1-ONLY-NEXT:    retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride4_vf8:
+; AVX2-ONLY:       # %bb.0:
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
+; AVX2-ONLY-NEXT:    vmovq %xmm3, (%rdx)
+; AVX2-ONLY-NEXT:    vmovq %xmm4, (%rcx)
+; AVX2-ONLY-NEXT:    vmovq %xmm1, (%r8)
+; AVX2-ONLY-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i8_stride4_vf8:
 ; AVX512:       # %bb.0:
@@ -433,38 +459,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %xmm2
 ; AVX2-ONLY-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm1, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -844,16 +870,16 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX2-ONLY-NEXT:    vmovdqa 48(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm8, %ymm1, %ymm9
 ; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
 ; AVX2-ONLY-NEXT:    vpermd %ymm9, %ymm6, %ymm9
@@ -861,48 +887,48 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpermd %ymm8, %ymm6, %ymm8
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm2, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm9, %ymm1, %ymm10
 ; AVX2-ONLY-NEXT:    vpermd %ymm10, %ymm6, %ymm10
 ; AVX2-ONLY-NEXT:    vpshufb %ymm9, %ymm0, %ymm9
 ; AVX2-ONLY-NEXT:    vpermd %ymm9, %ymm6, %ymm9
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm5, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm4, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm3, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm10, %ymm1, %ymm11
 ; AVX2-ONLY-NEXT:    vpermd %ymm11, %ymm6, %ymm11
 ; AVX2-ONLY-NEXT:    vpshufb %ymm10, %ymm0, %ymm10
 ; AVX2-ONLY-NEXT:    vpermd %ymm10, %ymm6, %ymm10
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vpermd %ymm1, %ymm6, %ymm1
 ; AVX2-ONLY-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
@@ -918,7 +944,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512F-LABEL: load_i8_stride4_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-NEXT:    vmovdqa 96(%rdi), %ymm1
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm3
@@ -928,21 +954,21 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm5
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-NEXT:    vpshufb %ymm5, %ymm1, %ymm6
 ; AVX512F-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
 ; AVX512F-NEXT:    vpermt2d %ymm6, %ymm4, %ymm5
 ; AVX512F-NEXT:    vpsrld $8, %zmm2, %zmm6
 ; AVX512F-NEXT:    vpmovdb %zmm6, %xmm6
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm1, %ymm7
 ; AVX512F-NEXT:    vpshufb %ymm6, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpermt2d %ymm7, %ymm4, %ymm6
 ; AVX512F-NEXT:    vpsrld $16, %zmm2, %zmm7
 ; AVX512F-NEXT:    vpmovdb %zmm7, %xmm7
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpshufb %ymm7, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpermt2d %ymm1, %ymm4, %ymm3
@@ -1745,16 +1771,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm5
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %xmm6
 ; AVX2-ONLY-NEXT:    vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm6, %xmm8
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm5, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm4, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm13 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm13, %ymm1, %ymm9
 ; AVX2-ONLY-NEXT:    vmovdqa %ymm1, %ymm4
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1788,7 +1814,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm7, %xmm13
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm7, %xmm10
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1796,7 +1822,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm6, %xmm7
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm15 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm5, %xmm14
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1804,7 +1830,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm5, %xmm1
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm1, %ymm4, %ymm13
 ; AVX2-ONLY-NEXT:    vpermd %ymm13, %ymm2, %ymm13
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -1829,16 +1855,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm5, %xmm13
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %ymm4, %ymm7, %ymm13
 ; AVX2-ONLY-NEXT:    vpermd %ymm13, %ymm2, %ymm13
@@ -1862,20 +1888,20 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpermd %ymm3, %ymm2, %ymm3
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-ONLY-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
 ; AVX2-ONLY-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm5, %ymm7, %ymm6
 ; AVX2-ONLY-NEXT:    vpermd %ymm6, %ymm2, %ymm6
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -1916,7 +1942,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-NEXT:    vmovdqa 224(%rdi), %ymm3
 ; AVX512F-NEXT:    vpshufb %ymm7, %ymm3, %ymm5
 ; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm4
@@ -1934,7 +1960,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
 ; AVX512F-NEXT:    vpshufb %ymm8, %ymm4, %ymm10
 ; AVX512F-NEXT:    vpermt2d %ymm9, %ymm1, %ymm10
@@ -1949,7 +1975,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vpmovdb %zmm10, %xmm10
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-NEXT:    vpshufb %ymm9, %ymm3, %ymm10
 ; AVX512F-NEXT:    vpshufb %ymm9, %ymm4, %ymm11
 ; AVX512F-NEXT:    vpermt2d %ymm10, %ymm1, %ymm11
@@ -1964,7 +1990,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vpmovdb %zmm11, %xmm11
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-NEXT:    vpshufb %ymm10, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpshufb %ymm10, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpermt2d %ymm3, %ymm1, %ymm4

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 55d943b52659d..5e0f383246c6f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -150,26 +150,47 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movd %xmm3, (%r9)
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_i8_stride5_vf4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
-; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
-; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
-; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
-; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovd %xmm3, (%rsi)
-; AVX-NEXT:    vmovd %xmm4, (%rdx)
-; AVX-NEXT:    vmovd %xmm5, (%rcx)
-; AVX-NEXT:    vmovd %xmm6, (%r8)
-; AVX-NEXT:    vmovd %xmm0, (%r9)
-; AVX-NEXT:    retq
+; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
+; AVX1-ONLY:       # %bb.0:
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
+; AVX1-ONLY-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX1-ONLY-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX1-ONLY-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX1-ONLY-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT:    vmovd %xmm3, (%rsi)
+; AVX1-ONLY-NEXT:    vmovd %xmm4, (%rdx)
+; AVX1-ONLY-NEXT:    vmovd %xmm5, (%rcx)
+; AVX1-ONLY-NEXT:    vmovd %xmm6, (%r8)
+; AVX1-ONLY-NEXT:    vmovd %xmm0, (%r9)
+; AVX1-ONLY-NEXT:    retq
+;
+; AVX2-LABEL: load_i8_stride5_vf4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm3, (%rsi)
+; AVX2-NEXT:    vmovd %xmm4, (%rdx)
+; AVX2-NEXT:    vmovd %xmm5, (%rcx)
+; AVX2-NEXT:    vmovd %xmm6, (%r8)
+; AVX2-NEXT:    vmovd %xmm0, (%r9)
+; AVX2-NEXT:    retq
   %wide.vec = load <20 x i8>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
   %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
@@ -1661,7 +1682,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm9, %ymm1, %ymm2, %ymm7
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
+; AVX2-ONLY-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
@@ -1682,7 +1704,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm11 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm11, %ymm2, %ymm1, %ymm12
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
+; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm10, %ymm9, %ymm12, %ymm9
@@ -1700,7 +1723,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
+; AVX2-ONLY-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
@@ -1718,7 +1742,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
+; AVX2-ONLY-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm14 = <255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u>
@@ -1736,7 +1761,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
+; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -3576,7 +3602,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,3,8,13,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm1, %ymm12, %ymm5
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm0, %ymm5, %ymm12
@@ -3596,7 +3623,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
 ; AVX2-ONLY-NEXT:    vpor %xmm7, %xmm5, %xmm5
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,4,9,14,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
+; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm13, %ymm8
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm5, %ymm8, %ymm13
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm10, %ymm5
@@ -3614,7 +3642,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpor %xmm5, %xmm0, %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,0,5,10,15,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
+; AVX2-ONLY-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %ymm5, %ymm14, %ymm14
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm0, %ymm14, %ymm0
@@ -3636,7 +3665,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpor %xmm7, %xmm0, %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,1,6,11,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
+; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm3, %ymm0, %ymm6, %ymm0
@@ -3661,13 +3691,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm11, %ymm5, %ymm3, %ymm0
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm0, %ymm4, %ymm15, %ymm14
 ; AVX2-ONLY-NEXT:    vmovdqa 304(%rdi), %xmm11
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm11, %xmm2
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
 ; AVX2-ONLY-NEXT:    vmovdqa 288(%rdi), %xmm1
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm3
 ; AVX2-ONLY-NEXT:    vpor %xmm2, %xmm3, %xmm2
@@ -3684,9 +3714,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpblendw {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7],ymm9[8,9,10,11,12],ymm4[13,14,15]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,128,128,128,2,7,12>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm11, %xmm12
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm7
 ; AVX2-ONLY-NEXT:    vpor %xmm7, %xmm12, %xmm7
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -3699,9 +3729,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,128,128,128,128,4,9,14>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm11, %xmm9
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,u,0,5,10,15,128,128,128>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm1, %xmm12
 ; AVX2-ONLY-NEXT:    vpor %xmm9, %xmm12, %xmm9
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -3718,11 +3748,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm14, %xmm14
 ; AVX2-ONLY-NEXT:    vpor %xmm10, %xmm14, %xmm10
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,2,7,12,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
+; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm14, %ymm6, %ymm6
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovdqa 288(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm2, %ymm10, %ymm10
 ; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
 ; AVX2-ONLY-NEXT:    vpermd %ymm10, %ymm0, %ymm10
@@ -3738,9 +3769,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
 ; AVX2-ONLY-NEXT:    vpermd %ymm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,128,128,128,3,8,13>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm2, %xmm11, %xmm4
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,u,u,u,u,4,9,14,128,128,128>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX2-ONLY-NEXT:    vpor %xmm4, %xmm1, %xmm1
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 95800b30987ae..b15879a427b31 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -4226,7 +4226,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX2-ONLY-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm13, %ymm1, %ymm4, %ymm1
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4244,7 +4244,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
 ; AVX2-ONLY-NEXT:    vpor %xmm2, %xmm3, %xmm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0>
 ; AVX2-ONLY-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm4, %ymm11, %ymm12, %ymm2
@@ -4266,7 +4266,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
 ; AVX2-ONLY-NEXT:    vpblendvb %ymm13, %ymm0, %ymm2, %ymm0
 ; AVX2-ONLY-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4343,7 +4343,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm5, %xmm14
 ; AVX2-ONLY-NEXT:    vpor %xmm0, %xmm14, %xmm1
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm14, %ymm13, %ymm0
 ; AVX2-ONLY-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -4374,7 +4374,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u>
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm5, %xmm5
 ; AVX2-ONLY-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm6, %ymm13, %ymm11
 ; AVX2-ONLY-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7]
@@ -4752,7 +4752,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
 ; AVX512BW-NEXT:    vpshufb %xmm4, %xmm21, %xmm4
 ; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
 ; AVX512BW-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-NEXT:    vpshufb %ymm6, %ymm19, %ymm2 {%k2}
@@ -4791,7 +4791,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpshufb %xmm7, %xmm20, %xmm7
 ; AVX512BW-NEXT:    vpshufb %xmm9, %xmm21, %xmm9
 ; AVX512BW-NEXT:    vpor %xmm7, %xmm9, %xmm9
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm19, %ymm9 {%k2}
 ; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
@@ -4813,7 +4813,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm17, %xmm8, %xmm18
 ; AVX512BW-NEXT:    vporq %xmm12, %xmm18, %xmm18
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
 ; AVX512BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
 ; AVX512BW-NEXT:    kmovd %edi, %k5
 ; AVX512BW-NEXT:    vpshufb %ymm19, %ymm15, %ymm18 {%k5}
@@ -4853,7 +4853,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm16, %xmm8, %xmm8
 ; AVX512BW-NEXT:    vpor %xmm14, %xmm8, %xmm8
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
 ; AVX512BW-NEXT:    vpshufb %ymm14, %ymm15, %ymm8 {%k5}
 ; AVX512BW-NEXT:    vpshufb %xmm13, %xmm21, %xmm13
 ; AVX512BW-NEXT:    vpshufb %xmm16, %xmm20, %xmm15
@@ -4873,7 +4873,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vpshufb %ymm14, %ymm7, %ymm8 {%k5}
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm13 {%k3}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k1}
@@ -4913,7 +4913,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm7 {%k2}
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm10 {%k2}
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u>
 ; AVX512BW-NEXT:    vpshufb %xmm14, %xmm15, %xmm15

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 6a959b11bf142..d12a7d797c64e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -213,44 +213,161 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movd %xmm0, (%rax)
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_i8_stride7_vf4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
-; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm2, (%rsi)
-; AVX-NEXT:    vmovd %xmm3, (%rdx)
-; AVX-NEXT:    vmovd %xmm5, (%rcx)
-; AVX-NEXT:    vmovd %xmm7, (%r8)
-; AVX-NEXT:    vmovd %xmm4, (%r9)
-; AVX-NEXT:    vmovd %xmm6, (%r10)
-; AVX-NEXT:    vmovd %xmm0, (%rax)
-; AVX-NEXT:    retq
+; AVX1-ONLY-LABEL: load_i8_stride7_vf4:
+; AVX1-ONLY:       # %bb.0:
+; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX1-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX1-ONLY-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-ONLY-NEXT:    vmovd %xmm2, (%rsi)
+; AVX1-ONLY-NEXT:    vmovd %xmm3, (%rdx)
+; AVX1-ONLY-NEXT:    vmovd %xmm5, (%rcx)
+; AVX1-ONLY-NEXT:    vmovd %xmm7, (%r8)
+; AVX1-ONLY-NEXT:    vmovd %xmm4, (%r9)
+; AVX1-ONLY-NEXT:    vmovd %xmm6, (%r10)
+; AVX1-ONLY-NEXT:    vmovd %xmm0, (%rax)
+; AVX1-ONLY-NEXT:    retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride7_vf4:
+; AVX2-ONLY:       # %bb.0:
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-ONLY-NEXT:    vmovd %xmm2, (%rsi)
+; AVX2-ONLY-NEXT:    vmovd %xmm3, (%rdx)
+; AVX2-ONLY-NEXT:    vmovd %xmm5, (%rcx)
+; AVX2-ONLY-NEXT:    vmovd %xmm7, (%r8)
+; AVX2-ONLY-NEXT:    vmovd %xmm4, (%r9)
+; AVX2-ONLY-NEXT:    vmovd %xmm6, (%r10)
+; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rax)
+; AVX2-ONLY-NEXT:    retq
+;
+; AVX512F-LABEL: load_i8_stride7_vf4:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0]
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0]
+; AVX512F-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512F-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT:    vmovd %xmm2, (%rsi)
+; AVX512F-NEXT:    vmovd %xmm3, (%rdx)
+; AVX512F-NEXT:    vmovd %xmm5, (%rcx)
+; AVX512F-NEXT:    vmovd %xmm7, (%r8)
+; AVX512F-NEXT:    vmovd %xmm4, (%r9)
+; AVX512F-NEXT:    vmovd %xmm6, (%r10)
+; AVX512F-NEXT:    vmovd %xmm0, (%rax)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_i8_stride7_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vmovd %xmm2, (%rsi)
+; AVX512BW-NEXT:    vmovd %xmm3, (%rdx)
+; AVX512BW-NEXT:    vmovd %xmm5, (%rcx)
+; AVX512BW-NEXT:    vmovd %xmm7, (%r8)
+; AVX512BW-NEXT:    vmovd %xmm4, (%r9)
+; AVX512BW-NEXT:    vmovd %xmm6, (%r10)
+; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %wide.vec = load <28 x i8>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
   %strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
@@ -3311,7 +3428,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm8, %xmm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,1,2,4,6>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6]
+; AVX2-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm9, %ymm8, %ymm8
 ; AVX2-FAST-NEXT:    vmovdqa %ymm9, %ymm11
 ; AVX2-FAST-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3339,7 +3457,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u]
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm9, %xmm1
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,1,3,4,6>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [1,3,4,6,1,3,4,6]
+; AVX2-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm11, %ymm9, %ymm9
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
@@ -3457,7 +3576,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u]
 ; AVX2-FAST-NEXT:    vpor %xmm2, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,3,5,6>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6]
+; AVX2-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
@@ -3718,7 +3838,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpor %xmm4, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm4
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm5, %xmm20
 ; AVX512F-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm5
@@ -3923,7 +4043,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpor %xmm4, %xmm1, %xmm1
 ; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,1,2,4,6>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6]
+; AVX512F-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa 192(%rdi), %ymm12
 ; AVX512F-FAST-NEXT:    vpermd %ymm12, %ymm4, %ymm4
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -3956,7 +4077,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpor %xmm6, %xmm10, %xmm6
 ; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,1,3,4,6>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6]
+; AVX512F-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm12, %ymm10, %ymm10
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
@@ -3991,7 +4113,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpor %xmm6, %xmm13, %xmm6
 ; AVX512F-FAST-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,1,3,5,6>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6]
+; AVX512F-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm12, %ymm13, %ymm12
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
@@ -4153,7 +4276,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpor %xmm9, %xmm8, %xmm8
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm13
 ; AVX512BW-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
 ; AVX512BW-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm9
 ; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -4339,7 +4462,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
 ; AVX512BW-FAST-NEXT:    vpor %xmm12, %xmm11, %xmm11
 ; AVX512BW-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,2,4,6>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6]
+; AVX512BW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm13
 ; AVX512BW-FAST-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -4366,7 +4490,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
 ; AVX512BW-FAST-NEXT:    vpor %xmm12, %xmm11, %xmm11
 ; AVX512BW-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,3,4,6>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6]
+; AVX512BW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
@@ -4385,7 +4510,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
 ; AVX512BW-FAST-NEXT:    vpor %xmm12, %xmm10, %xmm10
 ; AVX512BW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,1,3,5,6>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6]
+; AVX512BW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpermd %ymm13, %ymm12, %ymm12
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
@@ -7377,7 +7503,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm4, %ymm5
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
@@ -7413,7 +7540,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm9, %ymm2, %ymm4, %ymm2
@@ -7442,11 +7570,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
 ; AVX2-SLOW-NEXT:    vmovdqa 432(%rdi), %xmm1
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm5
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX2-SLOW-NEXT:    vmovdqa 416(%rdi), %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm3, %xmm14
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7484,9 +7612,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpor %xmm0, %xmm9, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm9, %xmm1, %xmm13
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm15, %xmm3, %xmm8
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -7518,10 +7646,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm15, %xmm8
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm9, %xmm14, %xmm12
 ; AVX2-SLOW-NEXT:    vpor %xmm8, %xmm12, %xmm8
@@ -7553,9 +7681,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpor %xmm6, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm15, %xmm9
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm14, %xmm13
 ; AVX2-SLOW-NEXT:    vpor %xmm9, %xmm13, %xmm9
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -7585,10 +7713,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpor %xmm6, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm14, %xmm9
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm14, %xmm12
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm13, %xmm10, %xmm14
 ; AVX2-SLOW-NEXT:    vpor %xmm12, %xmm14, %xmm12
@@ -7624,9 +7752,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm7
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
 ; AVX2-SLOW-NEXT:    vpor %xmm7, %xmm13, %xmm7
 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -7699,7 +7827,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpor %xmm2, %xmm0, %xmm4
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm9, %xmm0
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
@@ -7721,7 +7850,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm5, %xmm3
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
@@ -7740,7 +7870,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm8, %xmm3
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm14, %xmm1
@@ -7761,7 +7892,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm8, %xmm7
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm4, %ymm7, %ymm4
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -7784,7 +7916,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm10, %xmm8
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -7804,10 +7937,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u>
 ; AVX2-SLOW-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
 ; AVX2-SLOW-NEXT:    vpor %xmm6, %xmm10, %xmm6
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
@@ -7912,7 +8045,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm12, %ymm0, %ymm4, %ymm0
@@ -7946,7 +8080,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
 ; AVX2-FAST-NEXT:    vpor %xmm4, %xmm3, %xmm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpblendvb %ymm1, %ymm2, %ymm3, %ymm13
@@ -7978,7 +8113,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm10, %ymm5
 ; AVX2-FAST-NEXT:    vmovdqa %ymm0, %ymm6
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,10,13>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13]
 ; AVX2-FAST-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -8011,7 +8146,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,3,0,2,1,3,4,6]
 ; AVX2-FAST-NEXT:    vpermd %ymm6, %ymm9, %ymm11
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,4,11,14>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14]
 ; AVX2-FAST-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm1, %ymm13, %ymm0, %ymm0
@@ -8037,11 +8172,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
 ; AVX2-FAST-NEXT:    vmovdqa 432(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa %xmm1, %xmm14
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
 ; AVX2-FAST-NEXT:    vmovdqa 416(%rdi), %xmm15
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm15, %xmm12
 ; AVX2-FAST-NEXT:    vpor %xmm10, %xmm12, %xmm10
@@ -8077,10 +8212,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
 ; AVX2-FAST-NEXT:    vpor %xmm11, %xmm9, %xmm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
 ; AVX2-FAST-NEXT:    vmovdqa %xmm14, %xmm13
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm14, %xmm14
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
 ; AVX2-FAST-NEXT:    vmovdqa %xmm15, %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
@@ -8112,9 +8247,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm11, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm4, %xmm14
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
 ; AVX2-FAST-NEXT:    vpshufb %xmm15, %xmm13, %xmm10
 ; AVX2-FAST-NEXT:    vpor %xmm14, %xmm10, %xmm10
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8145,10 +8280,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm13, %xmm11
 ; AVX2-FAST-NEXT:    vpor %xmm7, %xmm11, %xmm7
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8221,7 +8356,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm10, %xmm3
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7,8,9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm10, %ymm1, %ymm3, %ymm3
@@ -8241,7 +8377,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm9, %xmm1
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm2, %xmm9
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2],ymm9[3],ymm2[4,5,6],ymm9[7,8],ymm2[9,10],ymm9[11],ymm2[12,13,14],ymm9[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpblendvb %ymm10, %ymm1, %ymm2, %ymm9
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
@@ -8260,7 +8397,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm6, %xmm1
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6],ymm1[7,8],ymm6[9,10,11],ymm1[12],ymm6[13,14],ymm1[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm0
@@ -8281,7 +8419,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm7, %xmm6
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-FAST-NEXT:    vpblendvb %ymm10, %ymm2, %ymm6, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -8304,7 +8443,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm11, %xmm7
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7,8],ymm7[9],ymm11[10,11,12],ymm7[13],ymm11[14,15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
 ; AVX2-FAST-NEXT:    vpblendvb %ymm10, %ymm6, %ymm7, %ymm6
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -8327,7 +8467,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = [1,3,1,2,1,3,5,6]
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,5,8,15>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15]
 ; AVX2-FAST-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4,5,6,7],ymm3[8],ymm7[9,10,11,12,13,14,15]
@@ -8422,7 +8562,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm4, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
@@ -8458,7 +8599,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
+; AVX2-FAST-PERLANE-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm9, %ymm2, %ymm4, %ymm2
@@ -8487,11 +8629,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 432(%rdi), %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm1, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 416(%rdi), %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm3, %xmm14
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8529,9 +8671,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm0, %xmm9, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm1, %xmm13
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm15, %xmm3, %xmm8
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -8563,10 +8705,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm15, %xmm8
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm14, %xmm12
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm8, %xmm12, %xmm8
@@ -8598,9 +8740,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm6, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm15, %xmm9
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm14, %xmm13
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm9, %xmm13, %xmm9
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8630,10 +8772,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm6, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm14, %xmm9
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm14, %xmm12
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm13, %xmm10, %xmm14
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm12, %xmm14, %xmm12
@@ -8669,9 +8811,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm9, %xmm7
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm7, %xmm13, %xmm7
 ; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -8744,7 +8886,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm2, %xmm0, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm9, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
+; AVX2-FAST-PERLANE-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm9, %ymm0, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
@@ -8766,7 +8909,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm5, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
+; AVX2-FAST-PERLANE-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
@@ -8785,7 +8929,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm8, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
+; AVX2-FAST-PERLANE-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm14, %xmm1
@@ -8806,7 +8951,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm8, %xmm7
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
+; AVX2-FAST-PERLANE-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm4, %ymm7, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -8829,7 +8975,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm10, %xmm8
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
+; AVX2-FAST-PERLANE-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -8849,10 +8996,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u>
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm6, %xmm10, %xmm6
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm13, %xmm10, %xmm10
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
@@ -8959,7 +9106,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-SLOW-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %xmm0, %xmm28
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm7
@@ -9258,7 +9405,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-SLOW-NEXT:    vpor %xmm5, %xmm0, %xmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-ONLY-SLOW-NEXT:    vpternlogq $248, %ymm18, %ymm0, %ymm5
 ; AVX512F-ONLY-SLOW-NEXT:    vpshufb %xmm8, %xmm10, %xmm0
@@ -9370,7 +9517,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm1, %ymm5, %ymm5
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -9436,7 +9584,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpor %xmm5, %xmm2, %xmm2
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,4,6>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm1, %ymm5, %ymm5
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
@@ -9488,7 +9637,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpor %xmm5, %xmm2, %xmm2
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,5,6>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -9663,7 +9813,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT:    vpor %xmm7, %xmm0, %xmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %ymm3, %ymm0, %ymm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -10071,7 +10221,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
 ; AVX512DQ-SLOW-NEXT:    vpor %xmm0, %xmm3, %xmm0
 ; AVX512DQ-SLOW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7,8],ymm1[9],ymm9[10,11,12],ymm1[13],ymm9[14,15]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
 ; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-SLOW-NEXT:    vpternlogq $248, %ymm18, %ymm0, %ymm3
 ; AVX512DQ-SLOW-NEXT:    vpshufb %xmm11, %xmm12, %xmm0
@@ -10180,7 +10330,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
 ; AVX512DQ-FAST-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,2,4,6>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,6,1,2,4,6]
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm1
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm6, %ymm6
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -10246,7 +10397,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
 ; AVX512DQ-FAST-NEXT:    vpor %xmm5, %xmm2, %xmm2
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,4,6>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm5, %ymm5
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
@@ -10299,7 +10451,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
 ; AVX512DQ-FAST-NEXT:    vpor %xmm5, %xmm2, %xmm2
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,3,5,6>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -10471,7 +10624,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm5 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm21, %ymm4, %ymm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
@@ -10586,7 +10739,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-SLOW-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm21, %xmm7, %xmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -10866,7 +11019,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpor %xmm1, %xmm9, %xmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm8, %xmm4, %xmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
@@ -10966,7 +11119,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
 ; AVX512BW-ONLY-FAST-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm19
 ; AVX512BW-ONLY-FAST-NEXT:    vpermd %ymm19, %ymm5, %ymm5
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -11020,7 +11174,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
 ; AVX512BW-ONLY-FAST-NEXT:    vporq %xmm23, %xmm20, %xmm20
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,4,6>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
@@ -11043,7 +11198,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
 ; AVX512BW-ONLY-FAST-NEXT:    vporq %xmm20, %xmm15, %xmm15
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,5,6>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermd %ymm19, %ymm20, %ymm19
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
@@ -11219,7 +11375,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
 ; AVX512BW-ONLY-FAST-NEXT:    vporq %xmm21, %xmm0, %xmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm21
 ; AVX512BW-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]
@@ -11321,7 +11477,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-SLOW-NEXT:    vpor %xmm4, %xmm3, %xmm3
 ; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm21, %xmm7, %xmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm9
 ; AVX512DQBW-SLOW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
@@ -11601,7 +11757,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
 ; AVX512DQBW-SLOW-NEXT:    vpor %xmm0, %xmm9, %xmm0
 ; AVX512DQBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm8, %xmm4, %xmm9
 ; AVX512DQBW-SLOW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
@@ -11698,7 +11854,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
 ; AVX512DQBW-FAST-NEXT:    vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,1,2,4,6>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm19
 ; AVX512DQBW-FAST-NEXT:    vpermd %ymm19, %ymm5, %ymm5
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
@@ -11751,7 +11908,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
 ; AVX512DQBW-FAST-NEXT:    vporq %xmm23, %xmm20, %xmm20
 ; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,4,6>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6]
+; AVX512DQBW-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermd %ymm19, %ymm20, %ymm20
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
@@ -11774,7 +11932,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
 ; AVX512DQBW-FAST-NEXT:    vporq %xmm20, %xmm15, %xmm15
 ; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,1,3,5,6>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6]
+; AVX512DQBW-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermd %ymm19, %ymm20, %ymm19
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
@@ -11949,7 +12108,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u]
 ; AVX512DQBW-FAST-NEXT:    vporq %xmm20, %xmm21, %xmm20
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512DQBW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
 ; AVX512DQBW-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm21
 ; AVX512DQBW-FAST-NEXT:    vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQBW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 9cbb3fea50c7d..657c353b82335 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -235,54 +235,103 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT:    movd %xmm1, (%rax)
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_i8_stride8_vf4:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
-; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX1-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX1-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    vmovd %xmm3, (%rdx)
-; AVX1-NEXT:    vmovd %xmm4, (%rcx)
-; AVX1-NEXT:    vmovd %xmm5, (%r8)
-; AVX1-NEXT:    vmovd %xmm6, (%r9)
-; AVX1-NEXT:    vmovd %xmm7, (%r11)
-; AVX1-NEXT:    vmovd %xmm8, (%r10)
-; AVX1-NEXT:    vmovd %xmm1, (%rax)
-; AVX1-NEXT:    retq
+; AVX1-ONLY-LABEL: load_i8_stride8_vf4:
+; AVX1-ONLY:       # %bb.0:
+; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX1-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX1-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
+; AVX1-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
+; AVX1-ONLY-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
+; AVX1-ONLY-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-ONLY-NEXT:    vmovd %xmm0, (%rsi)
+; AVX1-ONLY-NEXT:    vmovd %xmm3, (%rdx)
+; AVX1-ONLY-NEXT:    vmovd %xmm4, (%rcx)
+; AVX1-ONLY-NEXT:    vmovd %xmm5, (%r8)
+; AVX1-ONLY-NEXT:    vmovd %xmm6, (%r9)
+; AVX1-ONLY-NEXT:    vmovd %xmm7, (%r11)
+; AVX1-ONLY-NEXT:    vmovd %xmm8, (%r10)
+; AVX1-ONLY-NEXT:    vmovd %xmm1, (%rax)
+; AVX1-ONLY-NEXT:    retq
+;
+; AVX2-ONLY-LABEL: load_i8_stride8_vf4:
+; AVX2-ONLY:       # %bb.0:
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
+; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
+; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
+; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rsi)
+; AVX2-ONLY-NEXT:    vmovd %xmm3, (%rdx)
+; AVX2-ONLY-NEXT:    vmovd %xmm4, (%rcx)
+; AVX2-ONLY-NEXT:    vmovd %xmm5, (%r8)
+; AVX2-ONLY-NEXT:    vmovd %xmm6, (%r9)
+; AVX2-ONLY-NEXT:    vmovd %xmm7, (%r11)
+; AVX2-ONLY-NEXT:    vmovd %xmm8, (%r10)
+; AVX2-ONLY-NEXT:    vmovd %xmm1, (%rax)
+; AVX2-ONLY-NEXT:    retq
 ;
 ; AVX512-LABEL: load_i8_stride8_vf4:
 ; AVX512:       # %bb.0:
@@ -638,74 +687,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %xmm2
 ; AVX2-ONLY-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
 ; AVX2-ONLY-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
 ; AVX2-ONLY-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm1, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm1, %xmm9
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm0, %xmm8
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm3, %xmm9
 ; AVX2-ONLY-NEXT:    vpshufb %xmm8, %xmm2, %xmm8
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm1, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm2, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm1, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm3, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm1, %xmm12
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm0, %xmm11
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1403,13 +1452,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-LABEL: load_i8_stride8_vf16:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    vmovdqa 112(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX2-ONLY-NEXT:    vmovdqa 96(%rdi), %xmm3
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX2-ONLY-NEXT:    vmovdqa 80(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm2, %xmm4, %xmm6
 ; AVX2-ONLY-NEXT:    vmovdqa 64(%rdi), %xmm5
 ; AVX2-ONLY-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
@@ -1419,147 +1468,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX2-ONLY-NEXT:    vmovdqa 32(%rdi), %xmm6
 ; AVX2-ONLY-NEXT:    vmovdqa 48(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm7, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm6, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm8, %xmm10
 ; AVX2-ONLY-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm4, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm7, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm6, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX2-ONLY-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm4, %xmm12
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm5, %xmm11
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm7, %xmm12
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm6, %xmm11
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm1, %xmm12
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm1, %xmm9
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm8, %xmm12
 ; AVX2-ONLY-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm4, %xmm13
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm7, %xmm13
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm6, %xmm12
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm13 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm2, %xmm14
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm1, %xmm13
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
 ; AVX2-ONLY-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm4, %xmm14
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm5, %xmm13
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm7, %xmm14
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm6, %xmm13
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm2, %xmm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm1, %xmm14
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm8, %xmm14
 ; AVX2-ONLY-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm4, %xmm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm15 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm1, %xmm15
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm8, %xmm14
 ; AVX2-ONLY-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm4, %xmm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
 ; AVX2-ONLY-NEXT:    vpshufb %xmm14, %xmm6, %xmm14
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm15 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm2, %xmm0
 ; AVX2-ONLY-NEXT:    vpshufb %xmm15, %xmm9, %xmm15
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
 ; AVX2-ONLY-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
 ; AVX2-ONLY-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX2-ONLY-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
 ; AVX2-ONLY-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1580,121 +1629,237 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovdqa %xmm1, (%rax)
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512-LABEL: load_i8_stride8_vf16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm0
-; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm1
-; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-NEXT:    vpsrlq $8, %zmm5, %zmm7
-; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
-; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
-; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-NEXT:    vpsrlq $16, %zmm5, %zmm8
-; AVX512-NEXT:    vpmovqb %zmm8, %xmm8
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
-; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
-; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT:    vpsrlq $24, %zmm5, %zmm9
-; AVX512-NEXT:    vpmovqb %zmm9, %xmm9
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
-; AVX512-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
-; AVX512-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-NEXT:    vpsrlq $32, %zmm5, %zmm10
-; AVX512-NEXT:    vpmovqb %zmm10, %xmm10
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512-NEXT:    vpsrlq $40, %zmm5, %zmm11
-; AVX512-NEXT:    vpmovqb %zmm11, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-NEXT:    vpsrlq $48, %zmm5, %zmm12
-; AVX512-NEXT:    vpmovqb %zmm12, %xmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT:    vpsrlq $56, %zmm5, %zmm1
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT:    vmovdqa %xmm6, (%rdx)
-; AVX512-NEXT:    vmovdqa %xmm7, (%rcx)
-; AVX512-NEXT:    vmovdqa %xmm8, (%r8)
-; AVX512-NEXT:    vmovdqa %xmm9, (%r9)
-; AVX512-NEXT:    vmovdqa %xmm10, (%r11)
-; AVX512-NEXT:    vmovdqa %xmm11, (%r10)
-; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: load_i8_stride8_vf16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512F-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512F-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512F-NEXT:    vpmovqb %zmm5, %xmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512F-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX512F-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512F-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
+; AVX512F-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512F-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512F-NEXT:    vpmovqb %zmm7, %xmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512F-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512F-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512F-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512F-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512F-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
+; AVX512F-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
+; AVX512F-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512F-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512F-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512F-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512F-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512F-NEXT:    vpmovqb %zmm10, %xmm10
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512F-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
+; AVX512F-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512F-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512F-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512F-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512F-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
+; AVX512F-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
+; AVX512F-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512F-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512F-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512F-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT:    vmovdqa %xmm4, (%rsi)
+; AVX512F-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512F-NEXT:    vmovdqa %xmm7, (%rcx)
+; AVX512F-NEXT:    vmovdqa %xmm8, (%r8)
+; AVX512F-NEXT:    vmovdqa %xmm9, (%r9)
+; AVX512F-NEXT:    vmovdqa %xmm10, (%r11)
+; AVX512F-NEXT:    vmovdqa %xmm11, (%r10)
+; AVX512F-NEXT:    vmovdqa %xmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_i8_stride8_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vmovdqa 112(%rdi), %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-NEXT:    vpmovqb %zmm5, %xmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512BW-NEXT:    vpsrlq $8, %zmm5, %zmm7
+; AVX512BW-NEXT:    vpmovqb %zmm7, %xmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512BW-NEXT:    vpsrlq $16, %zmm5, %zmm8
+; AVX512BW-NEXT:    vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm3, %xmm9
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512BW-NEXT:    vpsrlq $24, %zmm5, %zmm9
+; AVX512BW-NEXT:    vpmovqb %zmm9, %xmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512BW-NEXT:    vpsrlq $32, %zmm5, %zmm10
+; AVX512BW-NEXT:    vpmovqb %zmm10, %xmm10
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm2, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512BW-NEXT:    vpsrlq $40, %zmm5, %zmm11
+; AVX512BW-NEXT:    vpmovqb %zmm11, %xmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
+; AVX512BW-NEXT:    vpshufb %xmm11, %xmm1, %xmm11
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm2, %xmm13
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512BW-NEXT:    vpsrlq $48, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512BW-NEXT:    vpsrlq $56, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BW-NEXT:    vmovdqa %xmm4, (%rsi)
+; AVX512BW-NEXT:    vmovdqa %xmm6, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm7, (%rcx)
+; AVX512BW-NEXT:    vmovdqa %xmm8, (%r8)
+; AVX512BW-NEXT:    vmovdqa %xmm9, (%r9)
+; AVX512BW-NEXT:    vmovdqa %xmm10, (%r11)
+; AVX512BW-NEXT:    vmovdqa %xmm11, (%r10)
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %wide.vec = load <128 x i8>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
   %strided.vec1 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
@@ -3099,7 +3264,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    subq $360, %rsp # imm = 0x168
 ; AVX2-SLOW-NEXT:    vmovdqa 112(%rdi), %xmm4
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm10, %xmm4, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 96(%rdi), %xmm1
@@ -3109,7 +3274,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX2-SLOW-NEXT:    vmovdqa 80(%rdi), %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm1, %xmm2
 ; AVX2-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm5
 ; AVX2-SLOW-NEXT:    vpshufb %xmm12, %xmm5, %xmm6
@@ -3124,11 +3289,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm14, %xmm3, %xmm9
 ; AVX2-SLOW-NEXT:    vpshufb %xmm14, %xmm2, %xmm11
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm1, %xmm11
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm0, %xmm13
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -3172,23 +3337,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm7, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm15, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsp), %xmm11 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -3224,23 +3389,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm15, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm11, %xmm15
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm13, %xmm7
@@ -3274,24 +3439,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm14, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm9, %xmm4
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm15, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -3323,24 +3488,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm14, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm9, %xmm4
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -3374,22 +3539,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsp), %xmm11 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -3424,25 +3589,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -3474,25 +3639,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -3552,7 +3717,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu %ymm6, (%rsp) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 112(%rdi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa 96(%rdi), %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
@@ -3560,7 +3725,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FAST-NEXT:    vmovdqa 80(%rdi), %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa 64(%rdi), %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
@@ -3572,11 +3737,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rdi), %xmm13
 ; AVX2-FAST-NEXT:    vmovdqa 48(%rdi), %xmm10
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm9
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm5, %xmm11
 ; AVX2-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm9
@@ -3601,23 +3766,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
@@ -3633,20 +3798,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm13, %xmm12
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
@@ -3663,26 +3828,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vmovdqa %xmm11, %xmm5
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa %xmm8, %xmm11
 ; AVX2-FAST-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, %xmm8
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa %xmm10, %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm13, %xmm12
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
@@ -3697,23 +3862,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vmovdqa %xmm8, %xmm6
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm8, %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa %xmm7, %xmm14
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, %xmm15
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa %xmm13, %xmm9
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -3735,22 +3900,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vmovdqa %xmm5, %xmm1
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm9, %xmm4
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -3765,21 +3930,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm9, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
@@ -3793,20 +3958,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm9, %xmm6
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
@@ -3845,7 +4010,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE:       # %bb.0:
 ; AVX2-FAST-PERLANE-NEXT:    subq $360, %rsp # imm = 0x168
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 112(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm10, %xmm4, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 96(%rdi), %xmm1
@@ -3855,7 +4020,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 80(%rdi), %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm1, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 64(%rdi), %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm12, %xmm5, %xmm6
@@ -3870,11 +4035,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 48(%rdi), %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm14, %xmm3, %xmm9
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm14, %xmm2, %xmm11
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm1, %xmm11
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm0, %xmm13
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -3918,23 +4083,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm4, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm8, %xmm7, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm5, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm15, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsp), %xmm11 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
@@ -3970,23 +4135,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm15, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm11, %xmm15
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm13, %xmm7
@@ -4020,24 +4185,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm14, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm9, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm15, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -4069,24 +4234,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm14, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm9, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -4120,22 +4285,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm12, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsp), %xmm11 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -4170,25 +4335,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -4220,25 +4385,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm15, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm8, %xmm7
@@ -4295,7 +4460,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512F-SLOW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa 240(%rdi), %xmm2
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm2, %xmm12
 ; AVX512F-SLOW-NEXT:    vmovdqa 224(%rdi), %xmm2
@@ -4304,7 +4469,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm2
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm2, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm2, %xmm7
 ; AVX512F-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm2
@@ -4339,21 +4504,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm17
 ; AVX512F-SLOW-NEXT:    vmovdqa 160(%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa 176(%rdi), %xmm1
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm12, %xmm4
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm6, %xmm12
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm6, %xmm8
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm7, %xmm12
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm15, %xmm13
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm15, %xmm10
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm12, %xmm1, %xmm13
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm1, %xmm24
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
@@ -4361,7 +4526,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
 ; AVX512F-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa 144(%rdi), %xmm6
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm6, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm6, %xmm13
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
@@ -4386,11 +4551,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm4, %xmm1
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm7, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm10, %xmm14
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm10, %xmm23
@@ -4398,13 +4563,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm10
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm10, %xmm14
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm21, %xmm10
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -4428,13 +4593,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm4, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm4, %xmm21
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm8, %xmm25
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm7, %xmm5
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm9
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm14
@@ -4442,13 +4607,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm11
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm14
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm10, %xmm26
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
@@ -4472,25 +4637,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm21, %xmm6
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm7, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm14
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm14
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm13, %xmm28
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
@@ -4515,13 +4680,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm12, %xmm6, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm6, %xmm13
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm4
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm12, %xmm4, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm7, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm7, %xmm25
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm5
@@ -4530,13 +4695,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm11, %xmm27
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm6
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm11
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm8
@@ -4561,13 +4726,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm13, %xmm12
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm9, %xmm4, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm4, %xmm13
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm10
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm15, %xmm4
@@ -4576,12 +4741,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm27, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm11, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm15
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm8, %xmm4
@@ -4604,11 +4769,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm13, %xmm3
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
@@ -4616,12 +4781,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm27, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm6, %xmm4
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm15, %xmm6
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
@@ -4658,7 +4823,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-LABEL: load_i8_stride8_vf32:
 ; AVX512F-FAST:       # %bb.0:
 ; AVX512F-FAST-NEXT:    vmovdqa64 (%rdi), %zmm18
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
 ; AVX512F-FAST-NEXT:    vmovdqa64 224(%rdi), %ymm16
 ; AVX512F-FAST-NEXT:    vpermd %ymm16, %ymm0, %ymm4
@@ -4675,14 +4840,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
 ; AVX512F-FAST-NEXT:    vmovdqa 112(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm5
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm1, %xmm2
 ; AVX512F-FAST-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa 80(%rdi), %xmm12
 ; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm12, %xmm10
 ; AVX512F-FAST-NEXT:    vmovdqa 64(%rdi), %xmm5
@@ -4696,19 +4861,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm9, %ymm7
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm27
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm11, %ymm8
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm28
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm12, %xmm10
 ; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4718,27 +4883,27 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm4, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm6, %ymm7
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm30
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm9, %ymm7
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm11, %ymm8
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm26
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm8
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm1, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm12, %xmm10
 ; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
@@ -4748,25 +4913,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm6, %ymm4
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm11, %ymm6
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
 ; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm6, %xmm12, %xmm9
 ; AVX512F-FAST-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
@@ -4788,12 +4953,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm2, %xmm6
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm2, %xmm15
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm3, %xmm14
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm15, %xmm12, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
@@ -4811,12 +4976,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm14
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm6, %xmm14
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm3, %xmm7
 ; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm12, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -4836,11 +5001,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm14
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm6, %xmm14
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm12, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
@@ -4859,11 +5024,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm6, %xmm3
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm12, %xmm8
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
@@ -4895,14 +5060,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa 240(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa 224(%rdi), %xmm6
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa 208(%rdi), %xmm7
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm7, %xmm3
 ; AVX512BW-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm9
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm9, %xmm8
@@ -4934,23 +5099,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vmovdqa 144(%rdi), %xmm11
 ; AVX512BW-SLOW-NEXT:    vmovdqa 160(%rdi), %xmm12
 ; AVX512BW-SLOW-NEXT:    vmovdqa 176(%rdi), %xmm13
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -4969,23 +5134,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5004,23 +5169,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm11, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm19, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
@@ -5039,23 +5204,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm7, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm13, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5074,23 +5239,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm7, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5109,23 +5274,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm9, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm13, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm12, %xmm15
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm11, %xmm18
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm17, %xmm19, %xmm17
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
@@ -5143,23 +5308,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm7, %xmm13, %xmm9
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm7, %xmm12, %xmm7
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm9, %xmm19, %xmm9
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
@@ -5194,33 +5359,33 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
 ; AVX512BW-FAST-NEXT:    vmovdqa 224(%rdi), %ymm9
 ; AVX512BW-FAST-NEXT:    vpermd %ymm9, %ymm1, %ymm4
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm12, %ymm4, %ymm2
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm30 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm11
 ; AVX512BW-FAST-NEXT:    vpermd %ymm11, %ymm1, %ymm26
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm30, %ymm26, %ymm3
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa 160(%rdi), %ymm13
 ; AVX512BW-FAST-NEXT:    vpermd %ymm13, %ymm1, %ymm27
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm31, %ymm27, %ymm3
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm15
 ; AVX512BW-FAST-NEXT:    vpermd %ymm15, %ymm1, %ymm28
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm14, %ymm28, %ymm1
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-FAST-NEXT:    vmovdqa 112(%rdi), %xmm1
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm5
 ; AVX512BW-FAST-NEXT:    vmovdqa 96(%rdi), %xmm2
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm3
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm16 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-FAST-NEXT:    vmovdqa 80(%rdi), %xmm3
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm16, %xmm3, %xmm17
 ; AVX512BW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm5
@@ -5231,22 +5396,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm16, %ymm4, %ymm6
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm17, %ymm26, %ymm7
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm18, %ymm27, %ymm7
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm19, %ymm28, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm7, %xmm1, %xmm8
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm3, %xmm20
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3]
@@ -5256,22 +5421,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %ymm6, %ymm29
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm20, %ymm4, %ymm7
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm21, %ymm26, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm22, %ymm27, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm23, %ymm28, %ymm10
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm10
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm2, %xmm8
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm3, %xmm24
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3]
@@ -5280,22 +5445,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpmovqb %zmm10, %xmm10
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm24, %ymm4, %ymm4
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm25, %ymm26, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm26, %ymm27, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm27, %ymm28, %ymm10
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm10
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm2, %xmm8
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm3, %xmm28
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3]
@@ -5316,11 +5481,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm14, %ymm15, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm13
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm2, %xmm12
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm3, %xmm14
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm5, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
@@ -5336,11 +5501,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm19, %ymm15, %ymm14
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm2, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm14, %xmm3, %xmm16
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5356,11 +5521,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm23, %ymm15, %ymm6
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm2, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm14, %xmm3, %xmm16
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm14, %xmm5, %xmm14
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
@@ -5376,11 +5541,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm27, %ymm15, %ymm11
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm10, %xmm2, %xmm2
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
@@ -8168,7 +8333,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    subq $840, %rsp # imm = 0x348
 ; AVX2-SLOW-NEXT:    vmovdqa 368(%rdi), %xmm13
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 352(%rdi), %xmm12
@@ -8176,7 +8341,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-SLOW-NEXT:    vmovdqa 336(%rdi), %xmm4
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm4, %xmm15
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8187,14 +8352,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
 ; AVX2-SLOW-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa 288(%rdi), %xmm11
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm11, %xmm5
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vmovdqa 272(%rdi), %xmm10
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm10, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
@@ -8309,22 +8474,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm13, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm12, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm15, %xmm2
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm11, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm10, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm15
@@ -8410,25 +8575,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm13, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
@@ -8511,25 +8676,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -8613,24 +8778,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -8716,26 +8881,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm7, %xmm14
@@ -8817,26 +8982,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -8921,25 +9086,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
@@ -9063,14 +9228,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    subq $904, %rsp # imm = 0x388
 ; AVX2-FAST-NEXT:    vmovdqa 368(%rdi), %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa 352(%rdi), %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; AVX2-FAST-NEXT:    vmovdqa 336(%rdi), %xmm4
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, %xmm14
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9081,7 +9246,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
 ; AVX2-FAST-NEXT:    vmovdqa 304(%rdi), %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa 288(%rdi), %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9089,7 +9254,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX2-FAST-NEXT:    vmovdqa 272(%rdi), %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa 256(%rdi), %xmm6
 ; AVX2-FAST-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9107,7 +9272,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
 ; AVX2-FAST-NEXT:    vpermd %ymm6, %ymm0, %ymm5
 ; AVX2-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-FAST-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vmovdqa %ymm6, %ymm15
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7]
@@ -9117,12 +9282,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm0, %ymm6
 ; AVX2-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm6, %ymm11
 ; AVX2-FAST-NEXT:    vmovdqa %ymm4, %ymm13
 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm0, %ymm5
 ; AVX2-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-FAST-NEXT:    vpshufb %ymm6, %ymm5, %ymm12
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
@@ -9184,25 +9349,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm13, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm15, %xmm10
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm14, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm2, %xmm11
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm14
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -9258,24 +9423,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm13, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm15, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm1, %xmm9
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm2, %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa %xmm11, %xmm15
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm11
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm3, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm3, %xmm14
@@ -9328,25 +9493,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm9
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm10
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm2, %xmm11
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm8, %xmm12
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm3, %xmm14
@@ -9355,12 +9520,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm6, %ymm12
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm6, %ymm6
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
@@ -9368,7 +9533,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
@@ -9400,26 +9565,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
@@ -9431,7 +9596,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-FAST-NEXT:    vpshufb %ymm15, %ymm1, %ymm3
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7]
@@ -9440,7 +9605,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-FAST-NEXT:    vpshufb %ymm1, %ymm2, %ymm12
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
@@ -9461,7 +9626,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm14, %xmm7
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm8, %xmm8
@@ -9485,26 +9650,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm10
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm0, %xmm12
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm13
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm13, %xmm5, %xmm14
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -9557,25 +9722,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm12, %xmm12
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm0, %xmm13
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm13, %xmm0, %xmm14
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -9625,43 +9790,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm10
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm9, %xmm1, %xmm11
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm1, %xmm12
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm11, %xmm1, %xmm13
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm13
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm14
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm15
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm8
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm15, %ymm3, %ymm3
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -9739,7 +9904,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE:       # %bb.0:
 ; AVX2-FAST-PERLANE-NEXT:    subq $840, %rsp # imm = 0x348
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 368(%rdi), %xmm13
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 352(%rdi), %xmm12
@@ -9747,7 +9912,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 336(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm4, %xmm15
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9758,14 +9923,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 304(%rdi), %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 288(%rdi), %xmm11
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm11, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 272(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm10, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 256(%rdi), %xmm0
@@ -9880,22 +10045,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm13, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm6, %xmm12, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm15, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm11, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm10, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm9, %xmm15
@@ -9981,25 +10146,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm13, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm9, %xmm14
@@ -10082,25 +10247,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm13, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10184,24 +10349,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm13, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -10287,26 +10452,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm7, %xmm14
@@ -10388,26 +10553,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm11, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -10492,25 +10657,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm10, %xmm14
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
@@ -10636,7 +10801,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm0
 ; AVX512F-SLOW-NEXT:    vpmovqb %zmm0, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa 496(%rdi), %xmm15
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 480(%rdi), %xmm4
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
@@ -10644,7 +10809,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm13
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm13, %xmm4
 ; AVX512F-SLOW-NEXT:    vmovdqa 448(%rdi), %xmm14
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm14, %xmm5
@@ -10725,12 +10890,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm8, %xmm9, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm9, %xmm26
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm6, %xmm13, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm6, %xmm14, %xmm4
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -10739,7 +10904,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512F-SLOW-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa 432(%rdi), %xmm1
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm1, %xmm25
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
@@ -10748,7 +10913,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vmovdqa 384(%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vmovdqa 400(%rdi), %xmm1
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm7
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm1, %xmm28
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm10
@@ -10825,13 +10990,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm15, %xmm24
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm13, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm13, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm14, %xmm6
@@ -10840,13 +11005,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm14
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm14, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm13
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm8
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -10916,14 +11081,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm12
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm3, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm5, %xmm29
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm19, %xmm5
@@ -10933,11 +11098,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm14, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm6
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm13
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm8
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -11004,13 +11169,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm2
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm29, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
@@ -11018,13 +11183,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm8
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm24
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
@@ -11095,13 +11260,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm29, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11110,13 +11275,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm13
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm23, %xmm25
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm5
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm5, %xmm8
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm9, %xmm15
@@ -11184,13 +11349,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm23
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm29, %xmm4
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm29, %xmm21
@@ -11200,13 +11365,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm13, %xmm16
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm6
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm13
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm8
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -11273,13 +11438,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm20
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm21, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -11288,13 +11453,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm16, %xmm3
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm6
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm6, %xmm6
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm13, %xmm8
 ; AVX512F-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
@@ -11379,14 +11544,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-LABEL: load_i8_stride8_vf64:
 ; AVX512F-FAST:       # %bb.0:
 ; AVX512F-FAST-NEXT:    subq $408, %rsp # imm = 0x198
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
 ; AVX512F-FAST-NEXT:    vmovdqa 480(%rdi), %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm11
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa 448(%rdi), %ymm2
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-FAST-NEXT:    vpermd %ymm2, %ymm1, %ymm4
@@ -11394,14 +11559,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm25
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm3, %ymm10
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa 416(%rdi), %ymm3
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-FAST-NEXT:    vpermd %ymm3, %ymm1, %ymm4
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm12
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa64 384(%rdi), %ymm31
 ; AVX512F-FAST-NEXT:    vpermd %ymm31, %ymm1, %ymm4
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm5
@@ -11409,14 +11574,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm15
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
 ; AVX512F-FAST-NEXT:    vmovdqa 368(%rdi), %xmm2
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm5
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm21
 ; AVX512F-FAST-NEXT:    vmovdqa 352(%rdi), %xmm13
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm13, %xmm6
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa 336(%rdi), %xmm2
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm27
@@ -11474,31 +11639,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm11, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm11, %ymm16
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm10
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm30
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm24, %ymm11
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm24
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm23, %ymm8
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm8, %ymm3
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm21, %xmm12
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm12, %xmm3
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm13, %xmm5
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm13, %xmm21
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm27, %xmm13
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm13, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
@@ -11542,31 +11707,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm16, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm10, %ymm24
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm29
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm11, %ymm25
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm8, %ymm27
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm12, %xmm8
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm21, %xmm12
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm12, %xmm3
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm13, %xmm5
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm13, %xmm19
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm23, %xmm13
@@ -11610,31 +11775,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm16, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm24, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm14
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm27, %ymm2
 ; AVX512F-FAST-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm8, %xmm2
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm8, %xmm22
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm12, %xmm3
 ; AVX512F-FAST-NEXT:    vmovdqa %xmm12, %xmm15
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm19, %xmm12
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm12, %xmm5
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm13, %xmm6
@@ -11675,32 +11840,32 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
 ; AVX512F-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm4
 ; AVX512F-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm3, %ymm2
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX512F-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm14, %ymm2, %ymm6
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512F-FAST-NEXT:    vpermd %ymm31, %ymm0, %ymm2
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm31, %ymm8
 ; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm8
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm22, %xmm2
 ; AVX512F-FAST-NEXT:    vpshufb %xmm6, %xmm2, %xmm8
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm22, %xmm18
 ; AVX512F-FAST-NEXT:    vpshufb %xmm6, %xmm15, %xmm10
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm15, %xmm24
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm12, %xmm10
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm19, %xmm22
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm13, %xmm12
@@ -11746,29 +11911,29 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm12, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm6
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm25
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm21, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm3, %ymm11
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm17, %ymm1
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm28, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm12
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm18, %xmm5
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm5, %xmm12
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm24, %xmm10
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm10, %xmm13
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm22, %xmm1
 ; AVX512F-FAST-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm8
@@ -11809,30 +11974,30 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm12, %zmm31
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm2
 ; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm6
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm20
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm11
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm22, %ymm4
 ; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm0, %ymm11
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm17, %ymm12
 ; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX512F-FAST-NEXT:    vpshufb %ymm12, %ymm4, %ymm12
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm5, %xmm12
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm5, %xmm16
 ; AVX512F-FAST-NEXT:    vpshufb %xmm11, %xmm10, %xmm13
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm24, %xmm28
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm13, %xmm1, %xmm14
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm1, %xmm24
 ; AVX512F-FAST-NEXT:    vpshufb %xmm13, %xmm8, %xmm15
@@ -11871,26 +12036,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm12, %zmm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm2
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm18, %ymm2
 ; AVX512F-FAST-NEXT:    vpshufb %ymm14, %ymm2, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
 ; AVX512F-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm4
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm28, %xmm5
 ; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
 ; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %xmm24, %xmm11
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm8, %xmm12
@@ -11954,7 +12119,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm2
 ; AVX512BW-SLOW-NEXT:    vpmovqb %zmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vmovdqa 496(%rdi), %xmm5
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm15, %xmm5, %xmm4
 ; AVX512BW-SLOW-NEXT:    vmovdqa %xmm5, %xmm7
 ; AVX512BW-SLOW-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -11964,7 +12129,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT:    vmovdqa 464(%rdi), %xmm6
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm19, %xmm6, %xmm5
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm6, %xmm24
 ; AVX512BW-SLOW-NEXT:    vmovdqa 448(%rdi), %xmm6
@@ -12052,28 +12217,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vmovdqa 400(%rdi), %xmm12
 ; AVX512BW-SLOW-NEXT:    vmovdqa 416(%rdi), %xmm13
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 432(%rdi), %xmm16
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm7, %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm25, %xmm10
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm25, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm24, %xmm19
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm24, %xmm24
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm26, %xmm25
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm3
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm27 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm27, %xmm16, %xmm24
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm16, %xmm22
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm27, %xmm13, %xmm25
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm13, %xmm18
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm30 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm30, %xmm12, %xmm24
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm12, %xmm28
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm30, %xmm4, %xmm25
@@ -12144,12 +12309,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm19, %xmm4
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm26, %xmm5
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm26, %xmm8
@@ -12158,13 +12323,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm22, %xmm5
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm22, %xmm23
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm18, %xmm30
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm18, %xmm22
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm30 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm30, %xmm28, %xmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm28, %xmm10
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -12232,26 +12397,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm19, %xmm4
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm23, %xmm5
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm22, %xmm26
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm26, %xmm10, %xmm30
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm26, %xmm18, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
@@ -12314,13 +12479,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa %xmm14, %xmm8
 ; AVX512BW-SLOW-NEXT:    vmovdqa %xmm15, %xmm14
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm19, %xmm4
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -12329,11 +12494,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm23, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm22, %xmm26
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm26 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm26, %xmm31, %xmm30
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm26, %xmm18, %xmm3
@@ -12390,11 +12555,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm8, %xmm1
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm14, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm19, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm15, %xmm4
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm15, %xmm19
@@ -12402,13 +12567,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm7, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm31, %xmm26
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm5, %xmm18, %xmm30
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3]
@@ -12465,13 +12630,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm23, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm2, %xmm19, %xmm5
@@ -12479,11 +12644,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm31, %xmm26
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm18, %xmm30
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %xmm18, %xmm31
@@ -12538,23 +12703,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm1, %xmm15, %xmm3
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm23, %xmm4
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm3, %xmm19, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm8, %xmm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm4, %xmm7, %xmm6
 ; AVX512BW-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm31, %xmm9
@@ -12631,40 +12796,40 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX512BW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm4
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
 ; AVX512BW-FAST-NEXT:    vmovdqa 480(%rdi), %ymm1
 ; AVX512BW-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FAST-NEXT:    vpermd %ymm1, %ymm0, %ymm30
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm2, %ymm30, %ymm1
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa 448(%rdi), %ymm2
 ; AVX512BW-FAST-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
 ; AVX512BW-FAST-NEXT:    vpermd %ymm2, %ymm0, %ymm31
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm3, %ymm31, %ymm2
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm3, %ymm11
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa 416(%rdi), %ymm2
 ; AVX512BW-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FAST-NEXT:    vpermd %ymm2, %ymm0, %ymm17
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm3, %ymm17, %ymm2
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm3, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vmovdqa64 384(%rdi), %ymm28
 ; AVX512BW-FAST-NEXT:    vpermd %ymm28, %ymm0, %ymm14
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm14, %ymm3
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm5, %ymm10
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-FAST-NEXT:    vmovdqa64 368(%rdi), %xmm20
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm7, %xmm20, %xmm2
 ; AVX512BW-FAST-NEXT:    vmovdqa64 352(%rdi), %xmm19
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm7, %xmm19, %xmm3
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm24 = <u,u,u,u,0,8,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm24 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX512BW-FAST-NEXT:    vmovdqa64 336(%rdi), %xmm18
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm24, %xmm18, %xmm5
 ; AVX512BW-FAST-NEXT:    vmovdqa64 320(%rdi), %xmm29
@@ -12712,25 +12877,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm8, %ymm30, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm8, %ymm5
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm8, %ymm31, %ymm13
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm8, %ymm7
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm8, %ymm17, %ymm13
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm8, %ymm6
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm8, %ymm14, %ymm15
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm20, %xmm15
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm19, %xmm24
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm15[0],xmm24[1],xmm15[1],xmm24[2],xmm15[2],xmm24[3],xmm15[3]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm24 = <u,u,u,u,1,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm24 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm24, %xmm18, %xmm26
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm24, %xmm29, %xmm21
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3]
@@ -12760,25 +12925,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm30, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm5, %ymm6
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm31, %ymm11
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm5, %ymm7
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm17, %ymm11
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm5, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm14, %ymm12
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,2,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm20, %xmm12
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm19, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,2,10,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %xmm18, %xmm26
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm18, %xmm15
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm29, %xmm21
@@ -12809,24 +12974,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm30, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa %ymm5, %ymm6
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm31, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm31 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm31, %ymm17, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm30 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm30, %ymm14, %ymm11
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,3,11,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm20, %xmm11
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %xmm19, %xmm24
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm8, %xmm19, %xmm12
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,3,11,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm18, %xmm14
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm29, %xmm15
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
@@ -12860,25 +13025,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
 ; AVX512BW-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
 ; AVX512BW-FAST-NEXT:    vpermd (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm13, %ymm7, %ymm2
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm4, %ymm5, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512BW-FAST-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
 ; AVX512BW-FAST-NEXT:    vpermd %ymm28, %ymm3, %ymm14
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm6, %ymm14, %ymm12
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm20, %xmm12
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %xmm20, %xmm16
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm19, %xmm15
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <u,u,u,u,4,12,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm21, %xmm18, %xmm15
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm21, %xmm29, %xmm28
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
@@ -12911,22 +13076,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm12, %zmm28
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm4, %ymm7, %ymm8
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm6, %ymm5, %ymm11
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm1, %ymm14, %ymm12
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm16, %xmm12
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm24, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,5,13,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm26, %xmm21
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm13, %xmm29, %xmm27
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3]
@@ -12956,22 +13121,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm21
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm0, %ymm7, %ymm3
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm17, %ymm5, %ymm4
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm6, %ymm14, %ymm8
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5],ymm8[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,u,6,14,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm4, %xmm16, %xmm8
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm4, %xmm24, %xmm11
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm11 = <u,u,u,u,6,14,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm26, %xmm12
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm11, %xmm29, %xmm13
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
@@ -13000,20 +13165,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm8, %zmm3
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm6, %ymm7, %ymm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm31, %ymm2, %ymm1
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm30, %ymm14, %ymm2
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,u,u,u,u,7,15,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm1, %xmm16, %xmm2
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm1, %xmm24, %xmm4
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,7,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm4, %xmm26, %xmm8
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm4, %xmm29, %xmm11
 ; AVX512BW-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
@@ -13082,14 +13247,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX: {{.*}}
 ; AVX2: {{.*}}
-; AVX512BW: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
 ; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
 ; AVX512DQBW-SLOW: {{.*}}
-; AVX512F: {{.*}}
 ; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index eb0ef5caaa0a1..8f160e2bafda0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -549,7 +549,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %xmm2
 ; AVX2-FAST-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm7
@@ -593,7 +593,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 16(%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 16(%rdi), %xmm7
@@ -1043,7 +1043,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm5
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rdi), %xmm6
@@ -1120,7 +1120,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm6
@@ -2048,7 +2048,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rdx), %ymm9
 ; AVX2-FAST-NEXT:    vmovdqa 64(%rdx), %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
@@ -2191,7 +2191,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rdx), %ymm9
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 64(%rdx), %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 5ef699f087c32..92acf21cad010 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -2354,7 +2354,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm12, %ymm9, %ymm4, %ymm9
 ; AVX2-SLOW-NEXT:    vpbroadcastq 24(%r8), %ymm4
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm12, %ymm1, %ymm4, %ymm12
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm1, %ymm13, %ymm4
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6]
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15]
@@ -2495,13 +2496,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpbroadcastq 32(%r8), %ymm8
 ; AVX2-FAST-NEXT:    vpblendvb %ymm15, %ymm0, %ymm8, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm11
 ; AVX2-FAST-NEXT:    vmovdqa %ymm3, %ymm7
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX2-FAST-NEXT:    vpshufb %ymm12, %ymm10, %ymm13
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -2526,11 +2528,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpblendvb %ymm14, %ymm13, %ymm5, %ymm12
 ; AVX2-FAST-NEXT:    vpbroadcastq 24(%r8), %ymm5
 ; AVX2-FAST-NEXT:    vpblendvb %ymm14, %ymm0, %ymm5, %ymm13
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm10, %ymm5
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm14, %ymm4, %ymm0
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15]
@@ -2666,13 +2670,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq 32(%r8), %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm15, %ymm0, %ymm8, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-PERLANE-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm0, %ymm4, %ymm11
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm3, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm12, %ymm10, %ymm13
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -2697,11 +2702,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm14, %ymm13, %ymm5, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq 24(%r8), %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm14, %ymm0, %ymm5, %ymm13
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm10, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm14, %ymm4, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15]
@@ -2877,7 +2884,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm7, %ymm6, %ymm13
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm14 = ymm4[1,1,1,2,5,5,5,6]
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
@@ -2984,16 +2992,18 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5]
 ; AVX512F-FAST-NEXT:    vprolq $16, %ymm11, %ymm1
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX512F-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm11, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm23
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm13
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
@@ -4715,7 +4725,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm11, %ymm1
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6]
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
@@ -5093,13 +5104,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm12, %ymm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5]
@@ -5175,11 +5188,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpbroadcastq 112(%r8), %ymm1
 ; AVX2-FAST-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm12, %ymm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm2, %ymm15
 ; AVX2-FAST-NEXT:    vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
@@ -5465,13 +5479,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX2-FAST-PERLANE-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm0, %ymm12, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5]
@@ -5547,11 +5563,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq 112(%r8), %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX2-FAST-PERLANE-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm0, %ymm12, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm7, %ymm2, %ymm15
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
@@ -5871,7 +5888,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7]
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm5, %ymm19
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm5, %ymm9, %ymm7
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6]
@@ -6158,17 +6176,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT:    vprolq $16, %ymm8, %ymm3
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15]
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29]
+; AVX512F-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
+; AVX512F-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm1
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
 ; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 71505f5912548..c20981d0d9398 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -1354,7 +1354,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <1,u,u,2,u,u,3,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2]
+; AVX2-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm14, %ymm15, %ymm14
 ; AVX2-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1]
@@ -1367,7 +1368,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
 ; AVX2-FAST-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm15, %ymm7, %ymm7
 ; AVX2-FAST-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
@@ -1393,7 +1395,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,4,u,u,5,u,u,6>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
 ; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
@@ -2810,7 +2813,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
 ; AVX2-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm1
@@ -3085,7 +3089,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
@@ -3653,7 +3658,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11]
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm11, %ymm10
 ; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
@@ -4062,7 +4068,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7]
 ; AVX512DQ-FAST-NEXT:    vpermt2d %ymm3, %ymm9, %ymm22
 ; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm5
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm4
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6489,7 +6496,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
 ; AVX2-FAST-NEXT:    vmovdqa (%r9), %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
@@ -7207,7 +7215,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm7, %ymm8, %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
@@ -8302,7 +8311,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm12, %zmm2, %zmm20
 ; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2d %ymm12, %ymm6, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm6, %ymm0, %ymm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm8, %ymm24
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31>
@@ -9033,7 +9043,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpermt2d %ymm6, %ymm19, %ymm21
 ; AVX512DQ-FAST-NEXT:    vmovdqa 96(%r9), %ymm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm6
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 8cd6e2c38b429..da36f165bd4c5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -4067,7 +4067,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm14, %ymm5, %ymm6, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
@@ -4327,7 +4327,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rdi), %ymm6
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm9, %ymm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4354,7 +4355,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm15
 ; AVX2-FAST-NEXT:    vmovdqa 32(%r8), %ymm5
 ; AVX2-FAST-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm7, %ymm3
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15]
@@ -4519,12 +4520,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpblendvb %ymm14, %ymm2, %ymm8, %ymm1
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm8
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm13, %xmm3, %xmm14
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -4791,11 +4792,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm9, %ymm5, %ymm7, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm1, %xmm14, %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm7, %xmm2, %xmm9
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3]
@@ -4908,7 +4909,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-PERLANE-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
@@ -4932,7 +4934,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
@@ -5142,7 +5144,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
@@ -5330,325 +5332,651 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512F-FAST-LABEL: store_i16_stride7_vf32:
-; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %ymm2
-; AVX512F-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
-; AVX512F-FAST-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
-; AVX512F-FAST-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm9, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm10, %ymm4
-; AVX512F-FAST-NEXT:    vporq %ymm3, %ymm4, %ymm19
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512F-FAST-NEXT:    vpshufb %ymm12, %ymm6, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm2, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512F-FAST-NEXT:    vpshufb %ymm15, %ymm14, %ymm2
-; AVX512F-FAST-NEXT:    vpor %ymm0, %ymm2, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %ymm15
-; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %ymm0
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
-; AVX512F-FAST-NEXT:    vporq %ymm1, %ymm2, %ymm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
-; AVX512F-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
-; AVX512F-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
-; AVX512F-FAST-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm2
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vprold $16, %ymm9, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm9, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
-; AVX512F-FAST-NEXT:    vmovdqa (%rax), %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512F-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm3
-; AVX512F-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpandn %ymm4, %ymm7, %ymm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm28
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm15, %ymm5
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
-; AVX512F-FAST-NEXT:    vpermi2q %zmm5, %zmm17, %zmm25
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rax), %ymm9
-; AVX512F-FAST-NEXT:    vpermd %ymm9, %ymm5, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpandn %ymm5, %ymm7, %ymm5
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm17
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm31
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm2
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT:    vprold $16, %ymm15, %ymm2
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm2
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512F-FAST-NEXT:    vmovdqa64 (%rax), %zmm27
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = <u,5,u,u,u,6,u,u,30,u,u,u,31,u,u,31>
-; AVX512F-FAST-NEXT:    vpermi2d %zmm27, %zmm9, %zmm29
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512F-FAST-NEXT:    vprold $16, %xmm10, %xmm2
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm6
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
-; AVX512F-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vprold $16, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm26
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm18
-; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm4, %ymm16
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm9, %xmm9
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
-; AVX512F-FAST-NEXT:    vpermd %ymm11, %ymm15, %ymm15
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm27 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm2 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm1 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm5 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm6
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm3, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm6, %zmm6
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
-; AVX512F-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm3 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm6 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm31, %zmm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm24, %zmm2
-; AVX512F-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm3
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm21, %zmm8, %zmm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm23, %zmm22, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm30, %zmm4
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm4
-; AVX512F-FAST-NEXT:    vpbroadcastd (%rax), %ymm1
-; AVX512F-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
-; AVX512F-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm15, %zmm2
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm3
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm4
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,u,4,u,u,u,5,u,u,13,u,u,u,14,u,u>
-; AVX512F-FAST-NEXT:    vpermd %zmm11, %zmm3, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm17, 320(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm29, 384(%rax)
-; AVX512F-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
-; AVX512F-FAST-NEXT:    vzeroupper
-; AVX512F-FAST-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm9, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm10, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm3, %ymm4, %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm12, %ymm6, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm12, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm15, %ymm14, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm1, %ymm2, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm9, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm9, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rax), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpandn %ymm4, %ymm7, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm15, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm17, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rax), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm9, %ymm5, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpandn %ymm5, %ymm7, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm15, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512F-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm27, %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm10, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm6
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm9, %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm11, %ymm15, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm3 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm24, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm21, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm23, %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm30, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd (%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm11, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
+; AVX512DQ-FAST-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm9, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm10, %ymm4
+; AVX512DQ-FAST-NEXT:    vporq %ymm3, %ymm4, %ymm19
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm6, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm12, %ymm2
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %ymm14
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm15, %ymm14, %ymm2
+; AVX512DQ-FAST-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vporq %ymm1, %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm9, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm9, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpandn %ymm4, %ymm7, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm28
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm15, %ymm5
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm17, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,u,u,u,5,u>
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %ymm9
+; AVX512DQ-FAST-NEXT:    vpermd %ymm9, %ymm5, %ymm5
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpandn %ymm5, %ymm7, %ymm5
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm15, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512DQ-FAST-NEXT:    # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm27, %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm10, %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm2, %xmm6
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm4, %ymm16
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm9, %xmm9
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7>
+; AVX512DQ-FAST-NEXT:    vpermd %ymm11, %ymm15, %ymm15
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm27 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm24, %zmm2
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm21, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm23, %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm30, %zmm4
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpbroadcastd (%rax), %ymm1
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm13, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm11, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 384(%rax)
+; AVX512DQ-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i16_stride7_vf32:
 ; AVX512BW:       # %bb.0:
@@ -8665,7 +8993,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm14, %ymm8, %ymm2, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2]
@@ -8953,7 +9281,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm11, %ymm7, %ymm10, %ymm7
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm11, %ymm7, %ymm8
 ; AVX2-SLOW-NEXT:    vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
@@ -9211,7 +9540,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX2-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
@@ -9249,7 +9578,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
@@ -9338,7 +9668,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vpermd %ymm9, %ymm13, %ymm14
 ; AVX2-FAST-NEXT:    vpblendvb %ymm11, %ymm0, %ymm14, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm13, %ymm15, %ymm11
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15]
@@ -9389,7 +9720,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpblendvb %ymm14, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
@@ -9713,11 +10045,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpblendvb %ymm1, %ymm7, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2]
 ; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 ; AVX2-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
@@ -10241,12 +10573,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm3, %ymm9, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # xmm3 = mem[1,1,2,2]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsp), %xmm4 # 16-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -10400,7 +10732,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm8, %ymm6, %ymm3, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-PERLANE-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
@@ -10443,7 +10776,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm2, %ymm6, %ymm7, %ymm15
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm7, %ymm14, %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -10483,7 +10816,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm15, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-FAST-PERLANE-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm8 = mem[2,2,2,2,6,6,6,6]
@@ -10597,7 +10931,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm1, %ymm12, %ymm13, %ymm12
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm13, %ymm15, %ymm14
 ; AVX2-FAST-PERLANE-NEXT:    vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm15 = mem[3,3,3,3,7,7,7,7]
@@ -10777,7 +11112,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11]
 ; AVX512F-SLOW-NEXT:    vpermi2q %zmm8, %zmm0, %zmm10
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512F-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm20, %ymm10
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm8, %ymm10, %ymm0
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm8, %ymm17
@@ -10998,7 +11334,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512F-SLOW-NEXT:    vmovdqa 96(%rcx), %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 96(%rdx), %xmm7
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm9
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2]
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7]
@@ -11416,728 +11752,1465 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512F-FAST-LABEL: store_i16_stride7_vf64:
-; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    subq $2200, %rsp # imm = 0x898
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rsi), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rdi), %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rdx), %ymm15
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm25
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm15, %ymm3
-; AVX512F-FAST-NEXT:    vporq %ymm0, %ymm3, %ymm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm6, %ymm23
-; AVX512F-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm16
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512F-FAST-NEXT:    vmovdqa 64(%r9), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa %ymm5, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqa 64(%r8), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
-; AVX512F-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm24
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rcx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rsi), %ymm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rdi), %ymm14
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm14, %ymm5
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa %ymm6, %ymm10
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
-; AVX512F-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm31
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm5
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %ymm12
-; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm12, %ymm4
-; AVX512F-FAST-NEXT:    vpor %ymm2, %ymm4, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %ymm11
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm11, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
-; AVX512F-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm6, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %ymm5
-; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm5, %ymm9
-; AVX512F-FAST-NEXT:    vporq %ymm9, %ymm0, %ymm22
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vprold $16, %ymm5, %ymm9
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512F-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm20, %ymm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm29
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27>
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm4
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm21
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23>
-; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm8
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm17, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm16, %zmm8
-; AVX512F-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
-; AVX512F-FAST-NEXT:    vmovdqa 96(%r8), %ymm9
-; AVX512F-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
-; AVX512F-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $248, %ymm16, %ymm8, %ymm9
-; AVX512F-FAST-NEXT:    vmovdqa 96(%r9), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm1, %ymm15
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15
-; AVX512F-FAST-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
-; AVX512F-FAST-NEXT:    vprold $16, %ymm1, %ymm8
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa64 64(%rax), %zmm23
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,5,u,u,u,6,u,u,30,u,u,u,31,u,u,31>
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpermi2d %zmm23, %zmm0, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm8
-; AVX512F-FAST-NEXT:    vpandnq %ymm8, %ymm16, %ymm8
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpbroadcastd 72(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm8
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rax), %ymm10
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm10, %ymm9
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm27
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm0, %zmm15
-; AVX512F-FAST-NEXT:    vmovdqa 64(%r9), %xmm2
-; AVX512F-FAST-NEXT:    vmovdqa 64(%r8), %xmm9
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm0, %zmm27
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm15
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm1
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %xmm8
-; AVX512F-FAST-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
-; AVX512F-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm15
-; AVX512F-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm15, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm18, %ymm4
-; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21>
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm5, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
-; AVX512F-FAST-NEXT:    vpermi2q %zmm0, %zmm22, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm2
-; AVX512F-FAST-NEXT:    vpandnq %ymm2, %ymm16, %ymm2
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm29, %ymm5
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm20, %ymm6
-; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm11, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm21, %ymm3
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm15
-; AVX512F-FAST-NEXT:    vpshufb %ymm15, %ymm13, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,21,u,u,u,22,u,u,14,u,u,u,15,u,u,15>
-; AVX512F-FAST-NEXT:    vpermi2d %zmm0, %zmm21, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm30, %ymm2
-; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm30, %ymm4
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa %ymm5, %ymm11
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm7
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vprold $16, %ymm2, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm3
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm2
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm10, %zmm23, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,u,4,u,u,u,5,u,u,13,u,u,u,14,u,u>
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm15, %ymm11, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm4, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31>
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT:    vmovdqa 96(%r9), %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa 96(%r8), %xmm6
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
-; AVX512F-FAST-NEXT:    vmovdqa %xmm3, %xmm14
-; AVX512F-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
-; AVX512F-FAST-NEXT:    vpermd %ymm10, %ymm2, %ymm3
-; AVX512F-FAST-NEXT:    vpbroadcastd 96(%rax), %ymm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rsi), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512F-FAST-NEXT:    vprold $16, %xmm0, %xmm10
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm0, %xmm23
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa 96(%rdx), %xmm10
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = <u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9>
-; AVX512F-FAST-NEXT:    vpshufb %xmm15, %xmm0, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa %xmm15, %xmm8
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512F-FAST-NEXT:    vpshufb %xmm15, %xmm7, %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa %xmm15, %xmm7
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
-; AVX512F-FAST-NEXT:    vpbroadcastd 100(%rax), %ymm0
-; AVX512F-FAST-NEXT:    vpbroadcastd 104(%rax), %ymm6
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm31
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vmovdqa %xmm8, %xmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa 64(%rsi), %xmm5
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm15, %xmm17
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX512F-FAST-NEXT:    vprold $16, %xmm5, %xmm5
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm28, %xmm0
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vmovdqa %xmm7, %xmm13
-; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm5
-; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
-; AVX512F-FAST-NEXT:    vpbroadcastd 64(%rax), %ymm5
-; AVX512F-FAST-NEXT:    vpbroadcastd 68(%rax), %ymm7
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %xmm5
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; AVX512F-FAST-NEXT:    vprold $16, %xmm9, %xmm9
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512F-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm13, %xmm0, %xmm9
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm13, %xmm29
-; AVX512F-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa %xmm14, %xmm2
-; AVX512F-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
-; AVX512F-FAST-NEXT:    vpbroadcastd (%rax), %ymm6
-; AVX512F-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm9
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vmovdqa %ymm3, %ymm14
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm25
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vprold $16, %ymm3, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5]
-; AVX512F-FAST-NEXT:    vmovdqa %ymm9, %ymm10
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm6
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm20
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm21, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <u,u,4,u,u,u,5,u,u,13,u,u,u,14,u,u>
-; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm0
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm6, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm11, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %xmm7
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm11
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
-; AVX512F-FAST-NEXT:    vpermd %ymm19, %ymm0, %ymm2
-; AVX512F-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm11
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm2, %zmm22
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm8
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm11
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %xmm7
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %xmm0
-; AVX512F-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
-; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9]
-; AVX512F-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
-; AVX512F-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm1
-; AVX512F-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm5
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm10
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10
-; AVX512F-FAST-NEXT:    vprold $16, %xmm0, %xmm1
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm17, %xmm15
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm23, %xmm3
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
-; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm23 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm4 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm3 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; AVX512F-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm31, %xmm5
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
-; AVX512F-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm6 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm7 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm5 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm31 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm30 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm29 = mem[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm28 = mem[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm12 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm23, %zmm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm3, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm23, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm13, %zmm23, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512F-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm4, %zmm9
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm19, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm21, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm8
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm21
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm17
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm31, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm26, %zmm27, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm25, %zmm8
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm26
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm29
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm4
-; AVX512F-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm7 = mem[0,1,1,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm7 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm8 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512F-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm27 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm24 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm12 = mem[0,2,2,3]
-; AVX512F-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm25 = mem[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm15 = mem[2,1,3,3]
-; AVX512F-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm16 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm28 = mem[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm18 = mem[0,0,1,3]
-; AVX512F-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm19 = mem[0,0,1,1]
-; AVX512F-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm7
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm25, %zmm12, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm12
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm12
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm15
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm12, %zmm0, %zmm22
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm8, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm24, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm3
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm28, %zmm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm7
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm7
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm3, %zmm0, %zmm8
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm10
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm22, 192(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm21, 448(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm8, 704(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm15, 640(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm26, 576(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm1, 512(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm20, 768(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm9, 832(%rax)
-; AVX512F-FAST-NEXT:    addq $2200, %rsp # imm = 0x898
-; AVX512F-FAST-NEXT:    vzeroupper
-; AVX512F-FAST-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $2200, %rsp # imm = 0x898
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rdx), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm15, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm0, %ymm3, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm5, %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm14, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm6, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm7, %ymm12, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm2, %ymm4, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm11, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm6, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm10, %ymm5, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm9, %ymm0, %ymm22
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm5, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm25, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm16, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%r8), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %ymm16, %ymm8, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm10, %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm1, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm23, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpandnq %ymm8, %ymm16, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rax), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm10, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm18, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm5, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpandnq %ymm2, %ymm16, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm29, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm6, %ymm11, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm25, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm15, %ymm13, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2d %zmm0, %zmm21, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm5, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm2, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm15, %ymm11, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm6, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512F-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%r9), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%r8), %xmm6
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm3, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm10, %ymm2, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 96(%rax), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rsi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm0, %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 96(%rdx), %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm15, %xmm0, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm15, %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm15, %xmm7, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm15, %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 100(%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 104(%rax), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm8, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm15, %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm7, %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 68(%rax), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm13, %xmm0, %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm13, %xmm29
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm14, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd (%rax), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm3, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %ymm3, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm9, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm6, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm11, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm19, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vprold $16, %xmm0, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm17, %xmm15
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm28, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512F-ONLY-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm23 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm4 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm3 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm31 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm30 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm29 = mem[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm28 = mem[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm12 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm13, %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm4, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm19, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm26, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm25, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = mem[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm24 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm12 = mem[0,2,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = mem[2,1,3,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm28 = mem[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm18 = mem[0,0,1,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm19 = mem[0,0,1,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm25, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm12, %zmm0, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm24, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $2200, %rsp # imm = 0x898
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $2200, %rsp # imm = 0x898
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rsi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rdx), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT:    vporq %ymm0, %ymm3, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm5, %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm24
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm14
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm14, %ymm5
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm6, %ymm10
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT:    vporq %ymm4, %ymm5, %ymm31
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm5
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm12, %ymm4
+; AVX512DQ-FAST-NEXT:    vpor %ymm2, %ymm4, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm11, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
+; AVX512DQ-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm6, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm5
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm10, %ymm5, %ymm9
+; AVX512DQ-FAST-NEXT:    vporq %ymm9, %ymm0, %ymm22
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm5, %ymm9
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm4
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm16, %zmm8
+; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6>
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%r8), %ymm9
+; AVX512DQ-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,4,u,u,u,5,u,u>
+; AVX512DQ-FAST-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm16, %ymm8, %ymm9
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%r9), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm10, %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15
+; AVX512DQ-FAST-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm1, %ymm8
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm23
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm23, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm8
+; AVX512DQ-FAST-NEXT:    vpandnq %ymm8, %ymm16, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rax), %ymm10
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm10, %ymm9
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm27
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r8), %xmm9
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpandn %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm31, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %xmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm15
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %zmm26, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm18, %ymm4
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm5, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm22, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm17, %ymm2
+; AVX512DQ-FAST-NEXT:    vpandnq %ymm2, %ymm16, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm29, %ymm5
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm20, %ymm6
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm11, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm15
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm15, %ymm13, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2d %zmm0, %zmm21, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm5, %ymm11
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm2, %ymm7
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm2, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm15, %ymm11, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FAST-NEXT:    # ymm22 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%r9), %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%r8), %xmm6
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm7, %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm3, %xmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm10, %ymm2, %ymm3
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 96(%rax), %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rsi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm0, %xmm10
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm30
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rcx), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 96(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm15, %xmm0, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm15, %xmm8
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm15, %xmm7, %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm15, %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 100(%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 104(%rax), %ymm6
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm8, %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm15, %xmm17
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm7, %xmm13
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 68(%rax), %ymm7
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm27
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm0, %xmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm13, %xmm29
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm14, %xmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpbroadcastd (%rax), %ymm6
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 4(%rax), %ymm9
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm3, %ymm14
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vprold $16, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm9, %ymm10
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm19, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x8 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm6, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm11, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %xmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %xmm8
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm19, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 32(%rax), %ymm11
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm8
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512DQ-FAST-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm10
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10
+; AVX512DQ-FAST-NEXT:    vprold $16, %xmm0, %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm17, %xmm15
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm28, %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm23 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm4 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm7 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm31 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm30 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm29 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm28 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm13, %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm0, %zmm4, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm8
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm30, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm26, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm24, %zmm25, %zmm8
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm8, %zmm0, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm7 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm7 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm27 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm24 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm25 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm15 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm28 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm18 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm19 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm25, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm12
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm3, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm12, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm27, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm11, %zmm24, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm7
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm3, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm7, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 832(%rax)
+; AVX512DQ-FAST-NEXT:    addq $2200, %rsp # imm = 0x898
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i16_stride7_vf64:
 ; AVX512BW:       # %bb.0:
@@ -12391,11 +13464,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
 ; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
 ; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 5d478ae0f3e25..4c7b0bcdc11c1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -619,10 +619,12 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
 ; AVX2-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <6,u,u,u,u,u,7,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0]
+; AVX2-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm10, %ymm10
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,6,u,u,u,u,u,7>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7]
+; AVX2-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm10, %ymm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7]
 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index ab3122960f53c..8d92086c44346 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -1177,9 +1177,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm4
 ; AVX512F-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm5
 ; AVX512F-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,7,15,23,31,u,u,u>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23]
+; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm5, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2d %zmm2, %zmm3, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
 ; AVX512F-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
@@ -1221,9 +1223,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm4
 ; AVX512BW-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm5
 ; AVX512BW-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,7,15,23,31,u,u,u>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23]
+; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
 ; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index c94df69efc80d..664be09b5118a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -617,7 +617,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
 ; AVX512F-NEXT:    movb $12, %r10b
 ; AVX512F-NEXT:    kmovw %r10d, %k1
@@ -663,7 +664,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
 ; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm11
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
@@ -702,7 +704,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
 ; AVX512BW-NEXT:    movb $12, %r10b
 ; AVX512BW-NEXT:    kmovd %r10d, %k1
@@ -748,7 +751,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm11
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
@@ -1462,281 +1466,1125 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: store_i64_stride6_vf16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm11
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm8
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,12>
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
-; AVX512F-NEXT:    movb $12, %r10b
-; AVX512F-NEXT:    kmovw %r10d, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512F-NEXT:    movb $16, %r10b
-; AVX512F-NEXT:    kmovw %r10d, %k2
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
-; AVX512F-NEXT:    movb $48, %r9b
-; AVX512F-NEXT:    kmovw %r9d, %k2
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %xmm20
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,u,7,15>
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 512(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    movb $12, %r10b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    movb $16, %r10b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    movb $48, %r9b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r9d, %k2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: store_i64_stride6_vf16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm8
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,4,12>
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
-; AVX512BW-NEXT:    movb $12, %r10b
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512BW-NEXT:    movb $16, %r10b
-; AVX512BW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
-; AVX512BW-NEXT:    movb $48, %r9b
-; AVX512BW-NEXT:    kmovd %r9d, %k2
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,u,7,15>
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    movb $12, %r10b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    movb $16, %r10b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    movb $48, %r9b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r9d, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT:    movb $12, %r10b
+; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    movb $16, %r10b
+; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQ-SLOW-NEXT:    movb $48, %r9b
+; AVX512DQ-SLOW-NEXT:    kmovw %r9d, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm22 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT:    movb $12, %r10b
+; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    movb $16, %r10b
+; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQ-FAST-NEXT:    movb $48, %r9b
+; AVX512DQ-FAST-NEXT:    kmovw %r9d, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm22 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %r10b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $16, %r10b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %r9b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r9d, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    movb $12, %r10b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    movb $16, %r10b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    movb $48, %r9b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r9d, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf16:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT:    movb $12, %r10b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    movb $16, %r10b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT:    movb $48, %r9b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r9d, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm22 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf16:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    movb $12, %r10b
+; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    movb $16, %r10b
+; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512DQBW-FAST-NEXT:    movb $48, %r9b
+; AVX512DQBW-FAST-NEXT:    kmovd %r9d, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm21
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm22 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
@@ -3204,565 +4052,2261 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: store_i64_stride6_vf32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $712, %rsp # imm = 0x2C8
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm3
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm15
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm28
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm27
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm24
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm20
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,4,12>
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
-; AVX512F-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
-; AVX512F-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,7,15>
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
-; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    movb $12, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512F-NEXT:    movb $48, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
-; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
-; AVX512F-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
-; AVX512F-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    movb $16, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm6
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 832(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512F-NEXT:    addq $712, %rsp # imm = 0x2C8
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    movb $12, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    movb $48, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    movb $16, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: store_i64_stride6_vf32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $712, %rsp # imm = 0x2C8
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm20
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <u,u,4,12>
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
-; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
-; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
-; AVX512BW-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,7,15>
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
-; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    movb $12, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512BW-NEXT:    movb $48, %al
-; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
-; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
-; AVX512BW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
-; AVX512BW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    movb $16, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm6
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 832(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512BW-NEXT:    addq $712, %rsp # imm = 0x2C8
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movb $12, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    movb $48, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    movb $16, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    movb $12, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    movb $48, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    movb $16, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-SLOW-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    movb $12, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    movb $48, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    movb $16, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    movb $16, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    movb $12, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    movb $48, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    movb $16, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    movb $12, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    movb $48, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    movb $16, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm20
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm28, %zmm6, %zmm16
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm19, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm18, %zmm26
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm29, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm31, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm9, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm15, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm9, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm17, %zmm30, %zmm9
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    movb $12, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    movb $48, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm23, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm13
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 128(%r8), %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 192(%r8), %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    movb $16, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm30, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm29, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm28, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm7, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm28, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm29, %zmm23
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 1280(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64
@@ -6827,7 +9371,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm23
 ; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm19
 ; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm10
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm9
 ; AVX512F-NEXT:    vpermt2q %zmm10, %zmm8, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6996,7 +9541,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermt2q %zmm23, %zmm1, %zmm24
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512F-NEXT:    vpermt2q %zmm22, %zmm25, %zmm9
@@ -7496,7 +10042,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm19
 ; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7665,7 +10212,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm24
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
 ; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm9
@@ -8169,16 +10717,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
 ; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
 ; AVX512F-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index ef9165d5cbf8b..43d48efd579e2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -770,7 +770,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
@@ -799,7 +800,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-61, %sil
@@ -892,7 +894,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
@@ -921,7 +924,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $-61, %sil
@@ -1007,7 +1011,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $-61, %sil
@@ -1041,7 +1046,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm8
 ; AVX512DQ-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
@@ -1128,7 +1134,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    movb $-61, %sil
@@ -1174,7 +1181,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm10, %zmm7
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm8
 ; AVX512DQ-FAST-NEXT:    movb $24, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
@@ -1256,7 +1264,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
@@ -1285,7 +1294,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-61, %sil
@@ -1378,7 +1388,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
@@ -1407,7 +1418,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %sil
@@ -1493,7 +1505,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $-61, %sil
@@ -1527,7 +1540,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
@@ -1614,7 +1628,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $-61, %sil
@@ -1660,7 +1675,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm10, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm8
 ; AVX512DQBW-FAST-NEXT:    movb $24, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
@@ -2497,9 +2513,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm12
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2615,7 +2633,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm22, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm22
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm28
@@ -2708,7 +2727,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm17
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm12
@@ -2768,7 +2788,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm13, %zmm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm26, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm25
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm25
 ; AVX512F-ONLY-FAST-NEXT:    movb $24, %dil
@@ -2917,9 +2938,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3038,7 +3061,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
 ; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm16
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm0, %zmm23
@@ -3128,7 +3152,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm19, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm18
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm12
@@ -3186,7 +3211,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm24, %zmm13, %zmm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm25, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # ymm26 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm24
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm24
 ; AVX512DQ-FAST-NEXT:    movb $24, %dil
@@ -3335,9 +3361,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm12
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3453,7 +3481,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm22, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm22
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm28
@@ -3546,7 +3575,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm17
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm12
@@ -3606,7 +3636,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm13, %zmm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm26, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm25
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm25
 ; AVX512BW-ONLY-FAST-NEXT:    movb $24, %dil
@@ -3755,9 +3786,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3876,7 +3909,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm16
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm0, %zmm23
@@ -3966,7 +4000,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm16
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm19, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm18
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm12
@@ -4024,7 +4059,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm24, %zmm13, %zmm28
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm25, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # ymm26 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm24
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm24
 ; AVX512DQBW-FAST-NEXT:    movb $24, %dil
@@ -5919,9 +5955,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm30
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm13
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm8
@@ -6329,11 +6367,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm3, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
@@ -6389,7 +6429,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm28, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm28, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm22, %zmm28
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
@@ -6801,9 +6842,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm29
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # ymm22 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm22, %zmm17
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm15
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm15
@@ -7211,10 +7254,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm16
@@ -7268,7 +7313,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6]
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm30, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm30, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm20, %zmm30
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm7
@@ -7674,9 +7720,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm30
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm13
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm8
@@ -8084,11 +8132,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm3, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
@@ -8144,7 +8194,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm28, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm28, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm22, %zmm28
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
@@ -8556,9 +8607,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm29
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm22 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # ymm22 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm22, %zmm17
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm15
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm15
@@ -8966,10 +9019,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm16
@@ -9023,7 +9078,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6]
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm30, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm30, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm20 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # ymm20 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm20, %zmm30
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm7
@@ -12937,10 +12993,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm8, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
@@ -13824,11 +13882,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14836,10 +14896,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,6,14>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm19
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
@@ -15721,11 +15783,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm23 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <u,7,15,u>
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16728,10 +16792,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm8, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
@@ -17615,11 +17681,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,7,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18627,10 +18695,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,6,14>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,7,15,u>
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm19
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
@@ -19512,11 +19582,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <u,u,6,14>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm23 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <u,7,15,u>
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index adcb0f5815815..083c206fe9356 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -821,7 +821,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
 ; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,6,14>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm5
@@ -831,7 +832,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
 ; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm4
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm13
@@ -842,7 +844,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
 ; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
@@ -853,7 +856,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
 ; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,5,13>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
@@ -936,7 +940,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
 ; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,6,14>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm5
@@ -946,7 +951,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
 ; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,7,15>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm13
@@ -957,7 +963,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
 ; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
@@ -968,7 +975,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,5,13>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
@@ -1896,423 +1904,1709 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: store_i64_stride8_vf16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,4,12>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,15>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <u,u,6,14>
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512F-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: store_i64_stride8_vf16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,5,13>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,4,12>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,7,15>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <u,u,6,14>
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    movb $-64, %al
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm19 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    movb $-64, %al
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm19 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf16:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm19 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf16:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    movb $-64, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm19 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %ymm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm14
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
@@ -4154,1001 +5448,4029 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: store_i64_stride8_vf32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2632, %rsp # imm = 0xA48
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovaps 128(%rdi), %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512F-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512F-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512F-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm8
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm22
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm17
-; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm20
-; AVX512F-NEXT:    movb $-64, %r11b
-; AVX512F-NEXT:    kmovw %r11d, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,4,12>
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,5,13>
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,6,14>
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <u,u,7,15>
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,4,12>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm28
-; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm31
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rcx), %ymm15
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %ymm18
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %ymm16
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512F-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512F-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512F-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 576(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 512(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    addq $2632, %rsp # imm = 0xA48
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %r11b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r11d, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: store_i64_stride8_vf32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2632, %rsp # imm = 0xA48
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovaps 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512BW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm20
-; AVX512BW-NEXT:    movb $-64, %r11b
-; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,4,12>
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,5,13>
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,6,14>
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <u,u,7,15>
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,4,12>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rcx), %ymm15
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %ymm18
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %ymm16
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 512(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    addq $2632, %rsp # imm = 0xA48
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %r11b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r11d, %k1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-SLOW-NEXT:    movb $-64, %r11b
+; AVX512DQ-SLOW-NEXT:    kmovw %r11d, %k1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm28 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-FAST-NEXT:    movb $-64, %r11b
+; AVX512DQ-FAST-NEXT:    kmovw %r11d, %k1
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %r11b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r11d, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %r11b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r11d, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %r11b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r11d, %k1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm28 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-FAST-NEXT:    movb $-64, %r11b
+; AVX512DQBW-FAST-NEXT:    kmovd %r11d, %k1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64
@@ -8927,2033 +13249,8149 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-LABEL: store_i64_stride8_vf64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $5384, %rsp # imm = 0x1508
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm19
-; AVX512F-NEXT:    movb $-64, %r11b
-; AVX512F-NEXT:    kmovw %r11d, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,u,7,15>
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512F-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512F-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 256(%rsi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqa64 256(%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 256(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 320(%rsi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512F-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512F-NEXT:    vmovdqa64 384(%rdx), %zmm8
-; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdx), %zmm3
-; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512F-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512F-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r10), %zmm29
-; AVX512F-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512F-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512F-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512F-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512F-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX512F-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rcx), %ymm0
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rcx), %ymm0
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa 256(%rcx), %ymm0
-; AVX512F-NEXT:    vmovdqa 256(%rdx), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512F-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512F-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512F-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512F-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512F-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512F-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512F-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512F-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512F-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512F-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512F-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512F-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512F-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512F-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512F-NEXT:    vmovdqa64 384(%rsi), %xmm28
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512F-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512F-NEXT:    vmovdqa64 448(%rsi), %xmm28
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3328(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2496(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2432(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1984(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1344(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512F-NEXT:    addq $5384, %rsp # imm = 0x1508
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
+; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512F-ONLY-SLOW:       # %bb.0:
+; AVX512F-ONLY-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %r11b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r11d, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: store_i64_stride8_vf64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $5384, %rsp # imm = 0x1508
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm19
-; AVX512BW-NEXT:    movb $-64, %r11b
-; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <u,u,4,12>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,5,13>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,6,14>
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,u,7,15>
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r10), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rcx), %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rcx), %ymm0
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa 256(%rcx), %ymm0
-; AVX512BW-NEXT:    vmovdqa 256(%rdx), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512BW-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512BW-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512BW-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512BW-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512BW-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %xmm28
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
-; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %xmm28
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3328(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2496(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2432(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1984(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1344(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512BW-NEXT:    addq $5384, %rsp # imm = 0x1508
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    movb $-64, %r11b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r11d, %k1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQ-SLOW-NEXT:    movb $-64, %r11b
+; AVX512DQ-SLOW-NEXT:    kmovw %r11d, %k1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQ-SLOW-NEXT:    # ymm27 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQ-FAST-NEXT:    movb $-64, %r11b
+; AVX512DQ-FAST-NEXT:    kmovw %r11d, %k1
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT:    # ymm27 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %r11b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r11d, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
+;
+; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512BW-ONLY-FAST:       # %bb.0:
+; AVX512BW-ONLY-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %r11b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r11d, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
+; AVX512BW-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-SLOW-NEXT:    movb $-64, %r11b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r11d, %k1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT:    # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
+;
+; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-FAST-NEXT:    movb $-64, %r11b
+; AVX512DQBW-FAST-NEXT:    kmovd %r11d, %k1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm12
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2432(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1920(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64
@@ -10981,16 +21419,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
 ; AVX512BW-FAST: {{.*}}
-; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
-; AVX512DQ-SLOW: {{.*}}
-; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
-; AVX512F-ONLY-SLOW: {{.*}}
 ; AVX512F-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 17ed73d9e3b16..164d8bef447a0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -511,7 +511,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
@@ -537,7 +538,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
@@ -992,7 +994,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
 ; AVX2-ONLY-NEXT:    vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
 ; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX2-ONLY-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-ONLY-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
@@ -1038,7 +1041,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-NEXT:    vpshufb %ymm7, %ymm2, %ymm2

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 04bc4e5da890f..4f2ee5d5985b0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-SLOW-NEXT:    shrq $48, %rax
 ; AVX2-SLOW-NEXT:    vmovd %eax, %xmm1
 ; AVX2-SLOW-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
 ; AVX2-SLOW-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vmovq %xmm0, 32(%r9)
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm2, (%r9)
@@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FAST-NEXT:    shrq $48, %rax
 ; AVX2-FAST-NEXT:    vmovd %eax, %xmm1
 ; AVX2-FAST-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
 ; AVX2-FAST-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vmovq %xmm0, 32(%r9)
 ; AVX2-FAST-NEXT:    vmovdqa %ymm2, (%r9)
@@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FAST-PERLANE-NEXT:    shrq $48, %rax
 ; AVX2-FAST-PERLANE-NEXT:    vmovd %eax, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovq %xmm0, 32(%r9)
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm2, (%r9)
@@ -1592,7 +1592,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255,255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
+; AVX2-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u>
@@ -3293,12 +3294,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm5, %ymm15
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX2-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm10, %ymm3
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
 ; AVX2-SLOW-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm12, %ymm4
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%rdx), %ymm14
@@ -3336,7 +3339,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4>
 ; AVX2-SLOW-NEXT:    vpermd %ymm15, %ymm2, %ymm5
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm6, %ymm10, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
@@ -3393,7 +3396,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    # xmm5 = mem[0,0,1,1]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm6, %ymm1, %ymm5, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm12, %ymm6
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7]
@@ -3404,7 +3407,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm12, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
 ; AVX2-SLOW-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT:    # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
@@ -3505,7 +3508,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX2-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu %ymm13, (%rsp) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
@@ -3515,7 +3519,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3]
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX2-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm14, %ymm2, %ymm5
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
 ; AVX2-FAST-NEXT:    vpor %ymm4, %ymm5, %ymm4
@@ -3547,11 +3552,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
 ; AVX2-FAST-NEXT:    vpshufb %ymm8, %ymm3, %ymm7
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX2-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm14
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3]
 ; AVX2-FAST-NEXT:    vpor %ymm7, %ymm14, %ymm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm14, %ymm13, %ymm13
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
@@ -3579,7 +3586,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpblendvb %ymm13, %ymm6, %ymm7, %ymm7
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4>
 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX2-FAST-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
 ; AVX2-FAST-NEXT:    vpblendvb %ymm13, %ymm3, %ymm2, %ymm2
@@ -3715,7 +3722,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm10, %ymm9
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm4, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
@@ -3725,7 +3733,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm5, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX2-FAST-PERLANE-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm8, %ymm4, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm6, %ymm7, %ymm6
@@ -3763,7 +3772,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm10, %ymm14, %ymm10
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX2-FAST-PERLANE-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm12, %ymm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm14, %ymm12, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3]
@@ -3793,7 +3803,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm11, %ymm8, %ymm9, %ymm8
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4>
 ; AVX2-FAST-PERLANE-NEXT:    vpermd %ymm5, %ymm9, %ymm5
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255>
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
@@ -3915,7 +3925,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm23
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512F-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm14, %ymm15
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm12, %zmm15, %zmm18
@@ -3959,12 +3970,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpandnq %ymm27, %ymm30, %ymm27
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm13, %ymm4, %ymm13
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm27, %zmm13, %zmm27
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm14, %ymm14
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm31, %ymm9
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm9, %ymm12, %ymm13
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512F-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm12, %ymm7, %ymm7
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm12, %ymm6, %ymm12
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero
@@ -4146,11 +4158,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpandnq %ymm28, %ymm29, %ymm28
 ; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm3
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm28, %zmm3, %zmm28
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm14
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm12, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512F-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm17, %ymm3
 ; AVX512F-FAST-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
 ; AVX512F-FAST-NEXT:    vpshufb %ymm12, %ymm7, %ymm12
@@ -4234,193 +4247,195 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: store_i8_stride5_vf64:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    movl $693250386, %eax # imm = 0x29522952
-; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm12
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm16, %ymm2, %ymm4
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
-; AVX512BW-SLOW-NEXT:    kmovd %eax, %k5
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %ymm9, %ymm4 {%k5}
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm13
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm14
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-SLOW-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k4
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm5, %zmm4 {%k4}
-; AVX512BW-SLOW-NEXT:    vmovdqa (%r8), %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%r8), %xmm17
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512BW-SLOW-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k2
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm23
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm9, %ymm23, %ymm18
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm24
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm11, %ymm24, %ymm19
-; AVX512BW-SLOW-NEXT:    vporq %ymm18, %ymm19, %ymm18
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm19, %xmm12, %xmm12
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm20, %xmm6, %xmm6
-; AVX512BW-SLOW-NEXT:    vpor %xmm6, %xmm12, %xmm6
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm6
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm21, %xmm14, %xmm12
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm22, %xmm13, %xmm13
-; AVX512BW-SLOW-NEXT:    vpor %xmm12, %xmm13, %xmm12
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm25
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
-; AVX512BW-SLOW-NEXT:    vpermd %ymm25, %ymm12, %ymm18
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm26
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
-; AVX512BW-SLOW-NEXT:    movl $138547332, %eax # imm = 0x8421084
-; AVX512BW-SLOW-NEXT:    kmovd %eax, %k2
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm13, %ymm26, %ymm18 {%k2}
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k3
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm14, %zmm6 {%k3}
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
-; AVX512BW-SLOW-NEXT:    vpermd %ymm15, %ymm14, %ymm18
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm17, %zmm17
-; AVX512BW-SLOW-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k6
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm17, %zmm6 {%k6}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm17, %ymm26, %ymm18
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm18, %ymm25, %ymm28
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vporq %ymm27, %ymm28, %ymm27
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm16, %ymm26, %ymm16
-; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %ymm25, %ymm16 {%k5}
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm27, %zmm16
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm25, %ymm23, %ymm26
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm24, %ymm28
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vporq %ymm26, %ymm28, %ymm26
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm8, %ymm24, %ymm8
-; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %ymm23, %ymm8 {%k1}
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm26, %zmm8
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm16, %zmm8 {%k4}
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm16
-; AVX512BW-SLOW-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k1}
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rcx), %xmm15
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm19, %xmm15, %xmm19
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm20, %xmm16, %xmm20
-; AVX512BW-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rsi), %xmm15
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm7, %zmm7
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %xmm16
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm21, %xmm16, %xmm19
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm22, %xmm15, %xmm20
-; AVX512BW-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm15, %xmm10
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
-; AVX512BW-SLOW-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm15, %zmm7
-; AVX512BW-SLOW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm25, %ymm1, %ymm7
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm0, %ymm15
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vpor %ymm7, %ymm15, %ymm7
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermd %ymm3, %ymm12, %ymm1
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm13, %ymm2, %ymm1 {%k2}
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm17, %ymm2, %ymm2
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm18, %ymm3, %ymm3
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX512BW-SLOW-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k3}
-; AVX512BW-SLOW-NEXT:    vpermd %ymm5, %ymm14, %ymm0
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
-; AVX512BW-SLOW-NEXT:    kmovq %rax, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm1, 64(%r9)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm10, (%r9)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm8, 256(%r9)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm6, 192(%r9)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm4, 128(%r9)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
+; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    movl $693250386, %eax # imm = 0x29522952
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm16, %ymm2, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %ymm9, %ymm4 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm5, %zmm4 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%r8), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm9, %ymm23, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm11, %ymm24, %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vporq %ymm18, %ymm19, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm19, %xmm12, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm20, %xmm6, %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpor %xmm6, %xmm12, %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm21, %xmm14, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm22, %xmm13, %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermd %ymm25, %ymm12, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX512BW-ONLY-SLOW-NEXT:    movl $138547332, %eax # imm = 0x8421084
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm13, %ymm26, %ymm18 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm14, %zmm6 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermd %ymm15, %ymm14, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm17, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm17, %zmm6 {%k6}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm17, %ymm26, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm18, %ymm25, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vporq %ymm27, %ymm28, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm16, %ymm26, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %ymm25, %ymm16 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm27, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm25, %ymm23, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm24, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vporq %ymm26, %ymm28, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm8, %ymm24, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %ymm23, %ymm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm26, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm16, %zmm8 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm19, %xmm15, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm20, %xmm16, %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %xmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm21, %xmm16, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm22, %xmm15, %xmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm10, %xmm15, %xmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm15, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm25, %ymm1, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm0, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpor %ymm7, %ymm15, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermd %ymm3, %ymm12, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm13, %ymm2, %ymm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm17, %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm18, %ymm3, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermd %ymm5, %ymm14, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 64(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, (%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 256(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 192(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 128(%r9)
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-FAST-LABEL: store_i8_stride5_vf64:
 ; AVX512BW-FAST:       # %bb.0:
@@ -4457,7 +4472,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4>
 ; AVX512BW-FAST-NEXT:    vpermd %ymm21, %ymm3, %ymm22
 ; AVX512BW-FAST-NEXT:    vmovdqa64 32(%rsi), %ymm23
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512BW-FAST-NEXT:    movl $138547332, %eax # imm = 0x8421084
 ; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm4, %ymm23, %ymm22 {%k1}
@@ -4576,6 +4591,196 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, 192(%r9)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm8, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    movl $693250386, %eax # imm = 0x29522952
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm16, %ymm2, %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %ymm9, %ymm4 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512DQBW-SLOW-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm5, %zmm4 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%r8), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512DQBW-SLOW-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm9, %ymm23, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm11, %ymm24, %ymm19
+; AVX512DQBW-SLOW-NEXT:    vporq %ymm18, %ymm19, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm19, %xmm12, %xmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm20, %xmm6, %xmm6
+; AVX512DQBW-SLOW-NEXT:    vpor %xmm6, %xmm12, %xmm6
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm21, %xmm14, %xmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm22, %xmm13, %xmm13
+; AVX512DQBW-SLOW-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4>
+; AVX512DQBW-SLOW-NEXT:    vpermd %ymm25, %ymm12, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm26
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX512DQBW-SLOW-NEXT:    movl $138547332, %eax # imm = 0x8421084
+; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm13, %ymm26, %ymm18 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm14, %zmm6 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4>
+; AVX512DQBW-SLOW-NEXT:    vpermd %ymm15, %ymm14, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm17, %zmm17
+; AVX512DQBW-SLOW-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm17, %zmm6 {%k6}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX512DQBW-SLOW-NEXT:    # ymm17 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm17, %ymm26, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm18, %ymm25, %ymm28
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vporq %ymm27, %ymm28, %ymm27
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm16, %ymm26, %ymm16
+; AVX512DQBW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %ymm25, %ymm16 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm27, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm25, %ymm23, %ymm26
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQBW-SLOW-NEXT:    # ymm27 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm24, %ymm28
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vporq %ymm26, %ymm28, %ymm26
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm8, %ymm24, %ymm8
+; AVX512DQBW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %ymm23, %ymm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm26, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm16, %zmm8 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512DQBW-SLOW-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm19, %xmm15, %xmm19
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm20, %xmm16, %xmm20
+; AVX512DQBW-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %xmm16
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm21, %xmm16, %xmm19
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm22, %xmm15, %xmm20
+; AVX512DQBW-SLOW-NEXT:    vporq %xmm19, %xmm20, %xmm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm10, %xmm15, %xmm10
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
+; AVX512DQBW-SLOW-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm7, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm25, %ymm1, %ymm7
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm0, %ymm15
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpor %ymm7, %ymm15, %ymm7
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermd %ymm3, %ymm12, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm13, %ymm2, %ymm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm17, %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm18, %ymm3, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpermd %ymm5, %ymm14, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
+; AVX512DQBW-SLOW-NEXT:    kmovq %rax, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 64(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, (%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 256(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 192(%r9)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 128(%r9)
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
   %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
   %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
@@ -4595,11 +4800,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2: {{.*}}
 ; AVX512: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index e8ca42820850f..bb052c6fa70d9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1286,7 +1286,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqa (%rcx), %ymm3
 ; AVX2-SLOW-NEXT:    vmovdqa (%r8), %ymm4
 ; AVX2-SLOW-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm5, %xmm6, %xmm8
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm6, %xmm7
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
@@ -1294,7 +1294,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm9, %xmm10
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm11
 ; AVX2-SLOW-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
@@ -1308,12 +1308,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm13, %ymm8, %ymm12, %ymm14
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm3, %ymm12
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm3, %ymm5
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm1, %ymm8
 ; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm8 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm2, %ymm15
 ; AVX2-SLOW-NEXT:    vpshufb %ymm8, %ymm0, %ymm8
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm0, %ymm1
@@ -1423,14 +1423,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa (%r8), %ymm6
 ; AVX2-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
 ; AVX2-FAST-NEXT:    vmovdqa (%rdx), %xmm5
 ; AVX2-FAST-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
 ; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm10, %xmm7, %xmm11
 ; AVX2-FAST-NEXT:    vmovdqa %xmm7, %xmm9
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm7
@@ -1445,11 +1445,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm4, %ymm12
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm2, %ymm10
 ; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm3, %ymm15
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
 ; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
@@ -1554,14 +1554,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%r8), %ymm6
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm7, %xmm0, %xmm8
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %xmm5
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm10, %xmm7, %xmm11
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm7, %xmm9
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm7
@@ -1576,11 +1576,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm13, %ymm10, %ymm12, %ymm14
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm4, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm2, %ymm10
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm3, %ymm15
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
@@ -1717,7 +1717,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm7, %zmm7
 ; AVX512F-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm14, %ymm12, %ymm15
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm14, %ymm10, %ymm14
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23]
@@ -1729,7 +1729,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm15, %zmm16
 ; AVX512F-SLOW-NEXT:    vpandq %zmm16, %zmm14, %zmm14
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm0, %ymm13, %ymm13
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm0, %ymm11, %ymm0
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23]
@@ -1738,7 +1738,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
 ; AVX512F-SLOW-NEXT:    vpternlogq $186, %zmm14, %zmm15, %zmm0
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm10, %ymm9, %ymm11
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm10, %ymm8, %ymm10
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23]
@@ -1747,7 +1747,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
 ; AVX512F-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm3, %xmm9
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX512F-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
@@ -1755,7 +1755,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vprold $16, %xmm9, %xmm9
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -1765,7 +1765,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5]
 ; AVX512F-SLOW-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -1820,7 +1820,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm15, %zmm7, %zmm7
 ; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm14, %ymm12, %ymm15
 ; AVX512F-FAST-NEXT:    vpshufb %ymm14, %ymm10, %ymm14
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23]
@@ -1832,7 +1832,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm15, %zmm15, %zmm16
 ; AVX512F-FAST-NEXT:    vpandq %zmm16, %zmm14, %zmm14
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm13, %ymm13
 ; AVX512F-FAST-NEXT:    vpshufb %ymm2, %ymm11, %ymm2
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[16],ymm13[16],ymm2[17],ymm13[17],ymm2[18],ymm13[18],ymm2[19],ymm13[19],ymm2[20],ymm13[20],ymm2[21],ymm13[21],ymm2[22],ymm13[22],ymm2[23],ymm13[23]
@@ -1841,7 +1841,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
 ; AVX512F-FAST-NEXT:    vpternlogq $186, %zmm14, %zmm15, %zmm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm9, %ymm11
 ; AVX512F-FAST-NEXT:    vpshufb %ymm10, %ymm8, %ymm10
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23]
@@ -1850,7 +1850,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
 ; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm9
 ; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
@@ -1858,7 +1858,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
 ; AVX512F-FAST-NEXT:    vinserti32x4 $2, %xmm2, %zmm9, %zmm2
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
 ; AVX512F-FAST-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
 ; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
@@ -1867,7 +1867,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
 ; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm2
 ; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1892,14 +1892,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm7, %xmm8
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
 ; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdx), %xmm12
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
@@ -1920,7 +1920,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vmovdqu16 %ymm11, %ymm10 {%k2}
 ; AVX512BW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7]
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%r9), %xmm10
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm11 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm11, %xmm10, %xmm13
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%r8), %xmm14
 ; AVX512BW-SLOW-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
@@ -1962,19 +1962,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
 ; AVX512BW-SLOW-NEXT:    vpermw %ymm8, %ymm10, %ymm9 {%k1}
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm8
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512BW-SLOW-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k1}
 ; AVX512BW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
 ; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
@@ -2013,12 +2013,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    movw $9362, %cx # imm = 0x2492
 ; AVX512BW-FAST-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-FAST-NEXT:    vpermw %ymm6, %ymm12, %ymm11 {%k2}
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm6, %xmm9, %xmm12
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
 ; AVX512BW-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm7, %xmm13
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm8, %xmm12
 ; AVX512BW-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
@@ -2029,7 +2029,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm6
 ; AVX512BW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm11, %xmm13
 ; AVX512BW-FAST-NEXT:    vmovdqa (%r8), %xmm14
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm12, %xmm14, %xmm12
@@ -2070,19 +2070,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
 ; AVX512BW-FAST-NEXT:    vpermw %ymm8, %ymm10, %ymm9 {%k1}
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm8
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm9, %ymm5, %ymm5
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm9, %ymm4, %ymm4
 ; AVX512BW-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
 ; AVX512BW-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512BW-FAST-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k1}
 ; AVX512BW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7]
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
 ; AVX512BW-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
@@ -3107,7 +3107,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm7
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3117,7 +3117,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqa (%rsi), %xmm14
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm12
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-SLOW-NEXT:    vpshufb %xmm3, %xmm14, %xmm1
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm13
@@ -3140,7 +3140,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm5, %ymm4
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
 ; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3148,7 +3148,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%rsi), %ymm6
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
 ; AVX2-SLOW-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
 ; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3186,7 +3186,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm8, %ymm2, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%r8), %ymm5
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm5, %ymm6
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm8, %ymm4, %ymm6, %ymm10
@@ -3212,7 +3213,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm15, %ymm10, %ymm3, %ymm2
@@ -3281,7 +3283,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb %xmm10, %xmm4, %xmm10
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm5, %ymm10, %ymm10
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm5, %ymm14
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
@@ -3301,7 +3304,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm14, %ymm10, %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm10, %ymm6, %ymm1
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
@@ -3343,10 +3347,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT:    # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm10, %ymm10
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm14, %ymm14
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm0, %ymm10, %ymm14, %ymm10
@@ -3366,7 +3372,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm5, %ymm4
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm12, %ymm10, %ymm4, %ymm4
@@ -3383,7 +3390,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm6, %ymm8
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm4, %ymm8, %ymm4
@@ -3425,7 +3433,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa (%rcx), %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3436,7 +3444,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm4
 ; AVX2-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3459,7 +3467,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm4
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
 ; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3467,7 +3475,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-NEXT:    vpshufb %ymm5, %ymm0, %ymm6
 ; AVX2-FAST-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
 ; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3504,7 +3512,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-NEXT:    vmovdqa 32(%r8), %ymm5
 ; AVX2-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3530,7 +3539,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX2-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
@@ -3565,13 +3575,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
@@ -3595,7 +3607,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm9, %ymm15
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
@@ -3615,7 +3628,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm5, %ymm2, %ymm1, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm4, %ymm10, %ymm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
@@ -3657,10 +3671,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT:    # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm5, %ymm5
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm15, %ymm15
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm0, %ymm5, %ymm15, %ymm5
@@ -3681,7 +3697,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm9, %ymm4
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm15, %ymm5, %ymm4, %ymm4
@@ -3697,7 +3714,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm10, %ymm5
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm13, %ymm4, %ymm5, %ymm4
@@ -3739,7 +3757,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rcx), %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rcx), %xmm9
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3750,7 +3768,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm4
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3773,7 +3791,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm5, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23]
@@ -3781,7 +3799,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm5, %ymm0, %ymm6
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm5, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
@@ -3818,7 +3836,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%r8), %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm5, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3844,7 +3863,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
@@ -3879,13 +3899,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
@@ -3909,7 +3931,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm9, %ymm15
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
@@ -3929,7 +3952,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm5, %ymm2, %ymm1, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX2-FAST-PERLANE-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm4, %ymm10, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
@@ -3971,10 +3995,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT:    # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm5, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm3, %ymm15, %ymm15
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm0, %ymm5, %ymm15, %ymm5
@@ -3995,7 +4021,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm9, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm15, %ymm5, %ymm4, %ymm4
@@ -4011,7 +4038,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm10, %ymm5
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm13, %ymm4, %ymm5, %ymm4
@@ -4056,7 +4084,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdx), %ymm7
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rsi), %ymm13
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm10, %xmm2, %xmm1
 ; AVX512F-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -4087,30 +4115,34 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm3, %xmm28
 ; AVX512F-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm11, %ymm13, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm11, %ymm4, %ymm1
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm29
 ; AVX512F-SLOW-NEXT:    vmovdqa %ymm13, %ymm4
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512F-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm13 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm13, %ymm6, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm13, %ymm7, %ymm1
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512F-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%r9), %ymm10
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512F-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm3, %ymm10, %ymm0
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512F-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm10, %ymm15
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm25
 ; AVX512F-SLOW-NEXT:    vmovdqa (%rsi), %ymm15
@@ -4165,11 +4197,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %xmm27, %xmm1
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm1, %ymm28
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512F-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm12, %ymm12
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm10, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm10, %xmm0, %xmm9
 ; AVX512F-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
@@ -4196,13 +4229,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm16
 ; AVX512F-SLOW-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512F-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%r8), %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm0, %ymm5
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512F-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm0, %ymm6
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512F-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm3, %ymm4
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
@@ -4334,32 +4370,36 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %ymm7
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm8
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %ymm12
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm7, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm8, %ymm1
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm8, %ymm26
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm7, %ymm27
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512F-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm2, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm6, %ymm1
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 ; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31]
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm6, %ymm28
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm29
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512F-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
+; AVX512F-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vmovdqa %ymm12, %ymm6
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm12, %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm12, %ymm12
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm6, %ymm30
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm12, %zmm2
@@ -4387,7 +4427,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm10, %ymm1
 ; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm7, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm8, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -4439,7 +4479,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm26, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm27, %ymm1
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512F-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
 ; AVX512F-FAST-NEXT:    vpshufb %ymm7, %ymm6, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm18
@@ -4447,25 +4488,29 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
 ; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm11, %xmm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512F-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm11, %ymm10, %ymm10
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm30, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm11, %ymm0, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm30
 ; AVX512F-FAST-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm7
 ; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm0, %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512F-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm15, %ymm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512F-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm0, %ymm15
 ; AVX512F-FAST-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
 ; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm1, %ymm0
 ; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm3, %xmm0
 ; AVX512F-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm1
 ; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -4599,221 +4644,223 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: store_i8_stride6_vf64:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm12
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm16
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm17
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm9
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm7, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm8
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm10
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm5, %ymm20, %ymm5
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm18
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
-; AVX512BW-SLOW-NEXT:    vprold $16, %ymm6, %ymm6
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512BW-SLOW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
-; AVX512BW-SLOW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k1}
-; AVX512BW-SLOW-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%r8), %xmm13
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm6, %ymm23, %ymm6
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%r8), %ymm21
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm21, %ymm15
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
-; AVX512BW-SLOW-NEXT:    movl $1227133513, %r10d # imm = 0x49249249
-; AVX512BW-SLOW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm6, %zmm0 {%k2}
-; AVX512BW-SLOW-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512BW-SLOW-NEXT:    vmovdqa 32(%r9), %xmm15
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm22, %ymm23, %ymm25
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%r9), %ymm22
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm26, %ymm22, %ymm27
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm25
-; AVX512BW-SLOW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
-; AVX512BW-SLOW-NEXT:    kmovq %r10, %k3
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm25, %zmm0 {%k3}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm25
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm27
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm28
-; AVX512BW-SLOW-NEXT:    vpermw %zmm28, %zmm7, %zmm7
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm28, %ymm20, %ymm20
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm28
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm29
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
-; AVX512BW-SLOW-NEXT:    vprold $16, %ymm30, %ymm30
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm30, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k1}
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%r8), %ymm30
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm30, %ymm24
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k2}
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
-; AVX512BW-SLOW-NEXT:    vmovdqa64 32(%r9), %ymm23
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm26, %ymm23, %ymm24
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm20, %zmm7 {%k3}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm25, %ymm20
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm27, %ymm26
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm25, %ymm26, %ymm25
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm25
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm27 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm28, %ymm20
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm29, %ymm31
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm28, %ymm29, %ymm28
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm20
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k1}
-; AVX512BW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %zmm28, %zmm25, %zmm25
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT:    movl $-1840700270, %ecx # imm = 0x92492492
-; AVX512BW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k2}
-; AVX512BW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
-; AVX512BW-SLOW-NEXT:    vpshufb %zmm25, %zmm23, %zmm23
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
-; AVX512BW-SLOW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm23, %zmm20 {%k3}
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm16, %ymm23
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm24, %ymm17, %ymm24
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm16, %ymm26, %ymm16
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm23, %zmm17
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm18, %ymm16
-; AVX512BW-SLOW-NEXT:    vpshufb %ymm27, %ymm19, %ymm23
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm18, %ymm29, %ymm18
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm16, %zmm16
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm17, %zmm16 {%k1}
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm21, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT:    vpshufb %zmm28, %zmm14, %zmm14
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm14, %zmm16 {%k2}
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm22, %zmm12, %zmm12
-; AVX512BW-SLOW-NEXT:    vpshufb %zmm25, %zmm12, %zmm12
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm12, %zmm16 {%k3}
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm12, %xmm9, %xmm14
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm12, %xmm11, %xmm17
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm9, %ymm11, %ymm9
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm14, %zmm9, %zmm9
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm8, %xmm17
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm10, %xmm18
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512BW-SLOW-NEXT:    vprold $16, %xmm8, %xmm8
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm17, %zmm8, %zmm8
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
-; AVX512BW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
-; AVX512BW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm10, %zmm8, %zmm8
-; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
-; AVX512BW-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm9 {%k1}
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm8, %zmm13, %zmm8
-; AVX512BW-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm8
-; AVX512BW-SLOW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512BW-SLOW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm8, %zmm9 {%k3}
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm12, %xmm1, %xmm8
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
-; AVX512BW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-SLOW-NEXT:    vpermw %ymm1, %ymm11, %ymm1
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
-; AVX512BW-SLOW-NEXT:    vpshufb %xmm14, %xmm4, %xmm8
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX512BW-SLOW-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
-; AVX512BW-SLOW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k2}
-; AVX512BW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
-; AVX512BW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
-; AVX512BW-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm1 {%k1}
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
-; AVX512BW-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm2
-; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k3}
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm16, 128(%rax)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm7, 256(%rax)
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
+; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64:
+; AVX512BW-ONLY-SLOW:       # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm0, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm5, %ymm20, %ymm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vprold $16, %ymm6, %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%r8), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm6, %ymm23, %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %ymm21
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm21, %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    movl $1227133513, %r10d # imm = 0x49249249
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm6, %zmm0 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm22, %ymm23, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %ymm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm26, %ymm22, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %r10, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm25, %zmm0 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm28, %zmm7, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm28, %ymm20, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vprold $16, %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm30, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%r8), %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm30, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 32(%r9), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm26, %ymm23, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm20, %zmm7 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm25, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm27, %ymm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm25, %ymm26, %ymm25
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm28, %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm29, %ymm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm28, %ymm29, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm28, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    movl $-1840700270, %ecx # imm = 0x92492492
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm25, %zmm23, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rcx, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm23, %zmm20 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm16, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm24, %ymm17, %ymm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm16, %ymm26, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm23, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm18, %ymm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %ymm27, %ymm19, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm18, %ymm29, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm16, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm17, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm21, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm28, %zmm14, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm14, %zmm16 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm22, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %zmm25, %zmm12, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm12, %zmm16 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm12, %xmm9, %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm12, %xmm11, %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm9, %ymm11, %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm14, %zmm9, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm14, %xmm8, %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm14, %xmm10, %xmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vprold $16, %xmm8, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm17, %zmm8, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm10, %zmm8, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm8, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
+; AVX512BW-ONLY-SLOW-NEXT:    kmovq %rcx, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm8, %zmm9 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm12, %xmm1, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %ymm1, %ymm11, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufb %xmm14, %xmm4, %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
+; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%r8), %zmm9
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%r9), %zmm10
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
@@ -4824,7 +4871,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vpermw %ymm3, %ymm8, %ymm3
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm12, %ymm5, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdx), %ymm6
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm12, %ymm6, %ymm4
@@ -4884,7 +4931,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqu8 %zmm10, %zmm7 {%k3}
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rcx), %xmm21
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rcx), %xmm12
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm23 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm23, %xmm12, %xmm10
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdx), %xmm22
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rdx), %xmm14
@@ -4896,7 +4943,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vpermw %ymm15, %ymm24, %ymm15
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm10, %zmm15, %zmm16
 ; AVX512BW-FAST-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} xmm25 = <u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u>
+; AVX512BW-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm25, %xmm15, %xmm10
 ; AVX512BW-FAST-NEXT:    vmovdqa64 32(%rdi), %xmm19
 ; AVX512BW-FAST-NEXT:    vpshufb %xmm25, %xmm19, %xmm20
@@ -4972,7 +5019,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vpermw %zmm5, %zmm18, %zmm1 {%k1}
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
 ; AVX512BW-FAST-NEXT:    vpermw %ymm2, %ymm5, %ymm2
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm6, %ymm8, %ymm8
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
@@ -4981,7 +5029,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqu16 %zmm2, %zmm11 {%k1}
 ; AVX512BW-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-FAST-NEXT:    vpermw %ymm2, %ymm5, %ymm2
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u>
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpshufb %ymm8, %ymm9, %ymm9
 ; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
@@ -5009,6 +5058,217 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, 320(%rax)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64:
+; AVX512DQBW-SLOW:       # %bb.0:
+; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29]
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm0, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rcx), %xmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%rdx), %xmm10
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm5, %ymm20, %ymm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23]
+; AVX512DQBW-SLOW-NEXT:    vprold $16, %ymm6, %ymm6
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQBW-SLOW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %xmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%r8), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm6, %ymm23, %ymm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %ymm21
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQBW-SLOW-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm21, %ymm15
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    movl $1227133513, %r10d # imm = 0x49249249
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm6, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm22, %ymm23, %ymm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %ymm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQBW-SLOW-NEXT:    # ymm26 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm26, %ymm22, %ymm27
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm27, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQBW-SLOW-NEXT:    kmovq %r10, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm25, %zmm0 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rsi), %ymm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rdi), %ymm27
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm29, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm28, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm28, %ymm20, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rcx), %ymm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%rdx), %ymm29
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23]
+; AVX512DQBW-SLOW-NEXT:    vprold $16, %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm30, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%r8), %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm30, %ymm24
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm20, %zmm7 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm20, %ymm23, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 32(%r9), %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm26, %ymm23, %ymm24
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm24, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm20, %zmm7 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm25, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm27, %ymm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm25, %ymm26, %ymm25
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm28, %ymm20
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm29, %ymm31
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm28, %ymm29, %ymm28
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm28, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT:    movl $-1840700270, %ecx # imm = 0x92492492
+; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm25, %zmm20 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm25, %zmm23, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512DQBW-SLOW-NEXT:    kmovq %rcx, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm23, %zmm20 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm16, %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm24, %ymm17, %ymm24
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm16, %ymm26, %ymm16
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm16, %zmm23, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm18, %ymm16
+; AVX512DQBW-SLOW-NEXT:    vpshufb %ymm27, %ymm19, %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm18, %ymm29, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm18, %zmm16, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm17, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm21, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm28, %zmm14, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm14, %zmm16 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm22, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpshufb %zmm25, %zmm12, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm12, %zmm16 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm12, %xmm9, %xmm14
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm12, %xmm11, %xmm17
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm9, %ymm11, %ymm9
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm14, %zmm9, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm14, %xmm8, %xmm17
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm14, %xmm10, %xmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; AVX512DQBW-SLOW-NEXT:    vprold $16, %xmm8, %xmm8
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm17, %zmm8, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm10, %zmm8, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17]
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm8, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm8, %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
+; AVX512DQBW-SLOW-NEXT:    kmovq %rcx, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm8, %zmm9 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm12, %xmm1, %xmm8
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQBW-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQBW-SLOW-NEXT:    vpermw %ymm1, %ymm11, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
+; AVX512DQBW-SLOW-NEXT:    vpshufb %xmm14, %xmm4, %xmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512DQBW-SLOW-NEXT:    vprold $16, %xmm2, %xmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermw %zmm2, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vzeroupper
+; AVX512DQBW-SLOW-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
   %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
   %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
@@ -5030,11 +5290,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2: {{.*}}
 ; AVX512: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
-; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
-; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 844dc41240166..ac61b2bf50c34 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2802,13 +2802,15 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
+; AVX2-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
 ; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u,u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
+; AVX2-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
@@ -3392,7 +3394,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpand %ymm0, %ymm9, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -3406,7 +3409,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5]
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpand %ymm11, %ymm10, %ymm10
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -3545,7 +3549,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpand %ymm0, %ymm9, %ymm0
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -3575,7 +3580,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
 ; AVX512F-FAST-NEXT:    vporq %zmm0, %zmm11, %zmm0
 ; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,5,4,u,5,u,4,u>
+; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0]
+; AVX512F-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm11, %ymm12, %ymm11
 ; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -6126,7 +6132,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm6, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -6158,7 +6165,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5]
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u>
+; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm13, %ymm4
 ; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
@@ -6170,7 +6177,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5]
 ; AVX2-SLOW-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-SLOW-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
@@ -6626,7 +6634,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FAST-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm0
 ; AVX2-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm9, %ymm0, %ymm8
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
@@ -6635,7 +6644,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm0, %ymm11
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
 ; AVX2-FAST-NEXT:    vpor %ymm8, %ymm11, %ymm8
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX2-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm11, %ymm2, %ymm12
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -6658,7 +6668,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
 ; AVX2-FAST-NEXT:    vpor %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-NEXT:    vpblendvb %ymm14, %ymm9, %ymm10, %ymm9
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-FAST-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-FAST-NEXT:    vpshufb %ymm10, %ymm15, %ymm11
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
@@ -7052,7 +7063,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX2-FAST-PERLANE-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm8, %ymm0, %ymm7
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
@@ -7061,7 +7073,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm9, %ymm6, %ymm10
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm7, %ymm10, %ymm7
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX2-FAST-PERLANE-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm10, %ymm2, %ymm11
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
@@ -7084,7 +7097,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm9, %ymm10, %ymm9
 ; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm13, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX2-FAST-PERLANE-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm9, %ymm14, %ymm10
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
@@ -7093,7 +7107,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm11, %ymm5, %ymm12
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm10, %ymm12, %ymm10
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31>
+; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31]
+; AVX2-FAST-PERLANE-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb %ymm12, %ymm4, %ymm13
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2]
@@ -7216,7 +7231,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm2, %ymm23
 ; AVX512F-SLOW-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512F-SLOW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm3, %ymm29
 ; AVX512F-SLOW-NEXT:    vporq %ymm0, %ymm1, %ymm24
@@ -7231,7 +7247,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdx), %ymm9
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512F-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm1, %ymm8, %ymm2
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
@@ -7239,7 +7256,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rsi), %ymm11
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
 ; AVX512F-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm6
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512F-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm4, %ymm6, %ymm3
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7361,7 +7379,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm26, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm30
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = <13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14]
 ; AVX512F-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm0, %ymm6, %ymm12
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm28
@@ -7379,7 +7397,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm13, %ymm26
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7]
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm12 = <9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10]
 ; AVX512F-SLOW-NEXT:    vmovdqa %ymm15, %ymm3
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm12, %ymm15, %ymm15
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm10, %zmm15, %zmm23
@@ -7419,7 +7437,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm15 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
+; AVX512F-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
 ; AVX512F-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm15, %ymm0, %ymm1
 ; AVX512F-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7438,18 +7456,21 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vmovdqa %ymm6, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512F-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm15, %ymm0
 ; AVX512F-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512F-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm25, %ymm0
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm0, %ymm3
 ; AVX512F-SLOW-NEXT:    vmovdqa %ymm4, %ymm15
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm29
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128>
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512F-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm15, %ymm4
 ; AVX512F-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm28
 ; AVX512F-SLOW-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
@@ -7493,7 +7514,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
 ; AVX512F-SLOW-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
 ; AVX512F-SLOW-NEXT:    vporq %zmm4, %zmm0, %zmm0
-; AVX512F-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-SLOW-NEXT:    vpand %ymm4, %ymm11, %ymm8
 ; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
 ; AVX512F-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
@@ -7625,438 +7647,885 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512F-FAST-LABEL: store_i8_stride7_vf64:
-; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    subq $1432, %rsp # imm = 0x598
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %ymm8
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa %ymm3, %ymm10
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
-; AVX512F-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512F-FAST-NEXT:    vmovdqa %ymm1, %ymm15
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa %ymm2, %ymm6
-; AVX512F-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
-; AVX512F-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm3, %ymm16
-; AVX512F-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm25
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
-; AVX512F-FAST-NEXT:    vporq %ymm0, %ymm1, %ymm22
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rcx), %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm1, %xmm30
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512F-FAST-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm20
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
-; AVX512F-FAST-NEXT:    vporq %xmm0, %xmm3, %xmm21
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512F-FAST-NEXT:    vmovdqa 32(%r8), %xmm11
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
-; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm12, %xmm3
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm12, %xmm29
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
-; AVX512F-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
-; AVX512F-FAST-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm16, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm19, %ymm12
-; AVX512F-FAST-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
-; AVX512F-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm10, %ymm18
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
-; AVX512F-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %ymm9, %ymm15, %ymm3
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
-; AVX512F-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512F-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512F-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512F-FAST-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
-; AVX512F-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
-; AVX512F-FAST-NEXT:    vmovdqa %xmm2, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512F-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa %xmm2, %xmm5
-; AVX512F-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm24, %ymm7
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27>
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX512F-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512F-FAST-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-FAST-NEXT:    vpandn %ymm1, %ymm4, %ymm4
-; AVX512F-FAST-NEXT:    vmovdqa (%rax), %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm8
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm22
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
-; AVX512F-FAST-NEXT:    vpternlogq $248, %ymm24, %ymm8, %ymm15
-; AVX512F-FAST-NEXT:    vmovdqa %xmm11, %xmm4
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm30, %xmm1
-; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm3, %xmm28
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm9, %xmm27
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm10, %xmm25
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm20, %xmm11
-; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; AVX512F-FAST-NEXT:    vmovdqa %xmm14, %xmm8
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX512F-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm21, %zmm30
-; AVX512F-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm3
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm10
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqa 32(%rax), %xmm10
-; AVX512F-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
-; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512F-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
-; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-FAST-NEXT:    vpandnq %ymm0, %ymm26, %ymm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm7
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29>
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm15, %ymm13
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512F-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm25, %xmm5
-; AVX512F-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
-; AVX512F-FAST-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm14
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23>
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm15
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpor %ymm6, %ymm4, %ymm1
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
-; AVX512F-FAST-NEXT:    vpternlogq $248, %ymm4, %ymm11, %ymm31
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm31, %zmm9
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm9
-; AVX512F-FAST-NEXT:    vpor %ymm5, %ymm12, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-FAST-NEXT:    vpor %ymm3, %ymm15, %ymm3
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm3
-; AVX512F-FAST-NEXT:    vpandq %ymm4, %ymm21, %ymm1
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vporq %zmm4, %zmm1, %zmm1
-; AVX512F-FAST-NEXT:    vpandq %ymm24, %ymm19, %ymm4
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vporq %zmm5, %zmm4, %zmm4
-; AVX512F-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
-; AVX512F-FAST-NEXT:    vpandq %ymm24, %ymm18, %ymm1
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm13, %zmm1
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vporq %zmm5, %zmm1, %zmm1
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512F-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
-; AVX512F-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm4 = mem[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm5, %zmm11
-; AVX512F-FAST-NEXT:    vpandq %ymm24, %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vporq %zmm2, %zmm4, %zmm2
-; AVX512F-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm5, %zmm2
-; AVX512F-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512F-FAST-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
-; AVX512F-FAST-NEXT:    vpermd %ymm4, %ymm5, %ymm18
-; AVX512F-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT:    vpermd %ymm12, %ymm5, %ymm17
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
-; AVX512F-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22>
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm14
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
-; AVX512F-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm13
-; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128>
-; AVX512F-FAST-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512F-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm8
-; AVX512F-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
-; AVX512F-FAST-NEXT:    vpermd %ymm6, %ymm19, %ymm19
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512F-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
-; AVX512F-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    # ymm21 = mem[0,1,0,1]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm1
-; AVX512F-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
-; AVX512F-FAST-NEXT:    vpor %ymm5, %ymm14, %ymm5
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512F-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
-; AVX512F-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512F-FAST-NEXT:    # zmm3 = mem[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm17, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
-; AVX512F-FAST-NEXT:    vpor %ymm4, %ymm15, %ymm4
-; AVX512F-FAST-NEXT:    vpor %ymm13, %ymm8, %ymm5
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512F-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
-; AVX512F-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512F-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm7, 320(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm22, 64(%rax)
-; AVX512F-FAST-NEXT:    vmovdqa64 %zmm20, 384(%rax)
-; AVX512F-FAST-NEXT:    addq $1432, %rsp # imm = 0x598
-; AVX512F-FAST-NEXT:    vzeroupper
-; AVX512F-FAST-NEXT:    retq
+; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64:
+; AVX512F-ONLY-FAST:       # %bb.0:
+; AVX512F-ONLY-FAST-NEXT:    subq $1432, %rsp # imm = 0x598
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm3, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rax), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vporq %ymm0, %ymm1, %ymm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rcx), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm1, %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512F-ONLY-FAST-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rdi), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vporq %xmm0, %xmm3, %xmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm7, %xmm12, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm12, %xmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm16, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm10, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm9, %ymm15, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512F-ONLY-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm2, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm2, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm23, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm24, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm1, %ymm6, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-ONLY-FAST-NEXT:    vpandn %ymm1, %ymm4, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rax), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512F-ONLY-FAST-NEXT:    # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %ymm24, %ymm8, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm11, %xmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm9, %xmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm10, %xmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm20, %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm14, %xmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm21, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-ONLY-FAST-NEXT:    vpandnq %ymm0, %ymm26, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512F-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm4, %ymm15, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm6, %ymm4, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512F-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, %ymm4, %ymm11, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm31, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm5, %ymm12, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm3, %ymm15, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpandq %ymm4, %ymm21, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm20, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vporq %zmm4, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpandq %ymm24, %ymm19, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vporq %zmm5, %zmm4, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpandq %ymm24, %ymm18, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpandq %ymm24, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vporq %zmm2, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm4, %ymm5, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm12, %ymm5, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512F-ONLY-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm26, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpermd %ymm6, %ymm19, %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512F-ONLY-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm21 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm5, %ymm14, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm4, %ymm15, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpor %ymm13, %ymm8, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $1432, %rsp # imm = 0x598
+; AVX512F-ONLY-FAST-NEXT:    vzeroupper
+; AVX512F-ONLY-FAST-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    subq $1432, %rsp # imm = 0x598
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm3, %ymm10
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
+; AVX512DQ-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
+; AVX512DQ-FAST-NEXT:    vporq %ymm0, %ymm1, %ymm22
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm1, %xmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FAST-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT:    vporq %xmm0, %xmm3, %xmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm12, %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm12, %xmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
+; AVX512DQ-FAST-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm16, %ymm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm10, %ymm18
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
+; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, %xmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm23, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm24, %ymm7
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm6, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT:    vpandn %ymm1, %ymm4, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm22
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm31
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512DQ-FAST-NEXT:    # ymm24 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm24, %ymm8, %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm11, %xmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm9, %xmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm10, %xmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm20, %xmm11
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm14, %xmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm21, %zmm30
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm3
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm10
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm26, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT:    vpandnq %ymm0, %ymm26, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm15, %ymm13
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm14
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm15
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm4, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm4, %ymm11, %ymm31
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm31, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm9
+; AVX512DQ-FAST-NEXT:    vpor %ymm5, %ymm12, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vpandq %ymm4, %ymm21, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm20, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm4, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm19, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm5, %zmm4, %zmm4
+; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm18, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm4, %ymm5, %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm12, %ymm5, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm26, %ymm4
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm8
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm6, %ymm19, %ymm19
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm21 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vpor %ymm5, %ymm14, %ymm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm17, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm15, %ymm4
+; AVX512DQ-FAST-NEXT:    vpor %ymm13, %ymm8, %ymm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 384(%rax)
+; AVX512DQ-FAST-NEXT:    addq $1432, %rsp # imm = 0x598
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-SLOW-LABEL: store_i8_stride7_vf64:
 ; AVX512BW-SLOW:       # %bb.0:
@@ -8170,7 +8639,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vmovdqu8 %zmm4, %zmm11 {%k2}
 ; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
 ; AVX512BW-SLOW-NEXT:    movl $338170920, %esi # imm = 0x14281428
 ; AVX512BW-SLOW-NEXT:    kmovd %esi, %k2
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm6, %ymm21, %ymm3 {%k2}
@@ -8181,7 +8650,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm26
 ; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7]
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u>
+; AVX512BW-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm7, %ymm18, %ymm4
 ; AVX512BW-SLOW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3]
@@ -8215,7 +8684,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm6, %ymm2, %ymm0 {%k2}
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128>
+; AVX512BW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512BW-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-SLOW-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
@@ -8711,11 +9181,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
 ; AVX512BW-ONLY-SLOW: {{.*}}
-; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
 ; AVX512DQBW-SLOW: {{.*}}
-; AVX512F-ONLY-FAST: {{.*}}
 ; AVX512F-ONLY-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
index 64f8ed9c20436..12d6b9cb582fc 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -65,7 +65,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -93,7 +94,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -121,7 +123,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -161,7 +164,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -247,7 +251,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -275,7 +280,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv4i64u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -303,7 +309,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv4i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -343,7 +350,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -419,7 +427,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -442,7 +451,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -465,7 +475,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -500,7 +511,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -571,7 +583,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -594,7 +607,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv8i32u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -617,7 +631,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv8i32u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -652,7 +667,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -713,7 +729,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -731,7 +748,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -749,7 +767,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -775,7 +794,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -830,7 +850,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
@@ -848,7 +869,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv16i16u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -866,7 +888,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv16i16u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3
@@ -892,7 +915,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
@@ -937,7 +961,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -950,7 +975,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -963,7 +989,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -989,7 +1016,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
@@ -1029,7 +1057,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1042,7 +1071,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VL-LABEL: testv32i8u:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -1055,7 +1085,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VLBWDQ-LABEL: testv32i8u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VLBWDQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VLBWDQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
@@ -1081,7 +1112,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index c015185fe4511..a724babe469c5 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -29,9 +29,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
@@ -56,12 +57,13 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrlq $32, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpandn %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
@@ -106,9 +108,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
@@ -133,12 +136,13 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrlq $32, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpandn %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
@@ -181,9 +185,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
@@ -210,12 +215,13 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpandn %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
@@ -266,9 +272,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
@@ -295,12 +302,13 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpandn %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm3, %ymm3
@@ -335,7 +343,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512CD-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
@@ -361,7 +369,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
@@ -379,10 +388,11 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512DQ-LABEL: testv32i16:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
@@ -419,7 +429,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512CD-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
@@ -445,7 +455,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
@@ -463,10 +474,11 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512DQ-LABEL: testv32i16u:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpand %ymm5, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %xmm6, %xmm6, %xmm6
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm6, %ymm4, %ymm7
@@ -508,7 +520,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512CD-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
 ; AVX512CD-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512CD-NEXT:    vplzcntd %zmm3, %zmm3
@@ -549,7 +561,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
@@ -561,10 +574,11 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512DQ-LABEL: testv64i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm6
@@ -596,7 +610,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
 ; AVX512CD-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512CD-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
 ; AVX512CD-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512CD-NEXT:    vplzcntd %zmm3, %zmm3
@@ -637,7 +651,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k0
@@ -649,10 +664,11 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512DQ-LABEL: testv64i8u:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpxor %xmm5, %xmm5, %xmm5
 ; AVX512DQ-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm6

diff  --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
index 56daf987c829e..8e8aca15860d5 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
@@ -107,7 +107,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -115,7 +115,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_2_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -210,7 +210,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -347,7 +347,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_3_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -355,7 +355,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -364,7 +364,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -374,7 +374,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -383,7 +383,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -391,7 +391,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_3_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -486,7 +486,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -623,7 +623,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_4_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -631,7 +631,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -640,7 +640,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -650,7 +650,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -659,7 +659,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -667,7 +667,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_4_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -762,7 +762,7 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -899,7 +899,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_5_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -907,7 +907,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -916,7 +916,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -926,7 +926,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -935,7 +935,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -943,7 +943,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_5_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1038,7 +1038,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1175,7 +1175,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_6_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1183,7 +1183,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -1192,7 +1192,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -1202,7 +1202,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -1211,7 +1211,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -1219,7 +1219,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_6_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1314,7 +1314,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1451,7 +1451,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_7_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1459,7 +1459,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -1468,7 +1468,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -1478,7 +1478,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -1487,7 +1487,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -1495,7 +1495,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; BITALG-LABEL: ult_7_v16i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -1600,7 +1600,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -1608,7 +1608,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_2_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -1720,7 +1720,7 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1879,7 +1879,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_3_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -1890,7 +1890,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -1899,7 +1899,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -1909,7 +1909,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -1918,7 +1918,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -1926,7 +1926,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_3_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2038,7 +2038,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2197,7 +2197,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_4_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2208,7 +2208,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -2217,7 +2217,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -2227,7 +2227,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -2236,7 +2236,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -2244,7 +2244,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_4_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2356,7 +2356,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2515,7 +2515,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_5_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2526,7 +2526,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -2535,7 +2535,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -2545,7 +2545,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -2554,7 +2554,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -2562,7 +2562,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_5_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2674,7 +2674,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2833,7 +2833,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_6_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -2844,7 +2844,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -2853,7 +2853,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -2863,7 +2863,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -2872,7 +2872,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -2880,7 +2880,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_6_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -2992,7 +2992,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3151,7 +3151,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_7_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3162,7 +3162,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -3171,7 +3171,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -3181,7 +3181,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -3190,7 +3190,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -3198,7 +3198,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_7_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3310,7 +3310,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3469,7 +3469,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_8_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3480,7 +3480,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -3489,7 +3489,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -3499,7 +3499,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -3508,7 +3508,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -3516,7 +3516,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_8_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3628,7 +3628,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3787,7 +3787,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_9_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -3798,7 +3798,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -3807,7 +3807,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -3817,7 +3817,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -3826,7 +3826,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -3834,7 +3834,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_9_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -3946,7 +3946,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4105,7 +4105,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_10_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4116,7 +4116,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -4125,7 +4125,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -4135,7 +4135,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -4144,7 +4144,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -4152,7 +4152,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_10_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4264,7 +4264,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4423,7 +4423,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_11_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4434,7 +4434,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -4443,7 +4443,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -4453,7 +4453,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -4462,7 +4462,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -4470,7 +4470,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_11_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4582,7 +4582,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4741,7 +4741,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_12_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -4752,7 +4752,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -4761,7 +4761,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -4771,7 +4771,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -4780,7 +4780,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -4788,7 +4788,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_12_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -4900,7 +4900,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5059,7 +5059,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_13_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5070,7 +5070,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -5079,7 +5079,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -5089,7 +5089,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -5098,7 +5098,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -5106,7 +5106,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_13_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -5218,7 +5218,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5377,7 +5377,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_14_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5388,7 +5388,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -5397,7 +5397,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -5407,7 +5407,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -5416,7 +5416,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -5424,7 +5424,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_14_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -5536,7 +5536,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5695,7 +5695,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_15_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -5706,7 +5706,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -5715,7 +5715,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -5725,7 +5725,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQVL-NEXT:    retq
@@ -5734,7 +5734,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -5742,7 +5742,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) {
 ; BITALG-LABEL: ult_15_v8i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
 ; BITALG-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; BITALG-NEXT:    retq
   %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -6011,7 +6011,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -6200,7 +6200,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_3_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -6385,7 +6385,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -6574,7 +6574,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_4_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -6759,7 +6759,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -6948,7 +6948,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_5_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -7133,7 +7133,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -7322,7 +7322,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_6_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -7507,7 +7507,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -7696,7 +7696,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_7_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -7881,7 +7881,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -8070,7 +8070,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_8_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -8255,7 +8255,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -8444,7 +8444,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_9_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -8629,7 +8629,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -8818,7 +8818,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_10_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9003,7 +9003,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9192,7 +9192,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_11_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9377,7 +9377,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9566,7 +9566,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_12_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9751,7 +9751,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -9940,7 +9940,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_13_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -10125,7 +10125,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -10314,7 +10314,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_14_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -10499,7 +10499,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -10688,7 +10688,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_15_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -10873,7 +10873,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_15_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11062,7 +11062,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_16_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11247,7 +11247,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_16_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11436,7 +11436,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_17_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11621,7 +11621,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_17_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11810,7 +11810,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_18_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -11995,7 +11995,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_18_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -12184,7 +12184,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_19_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -12369,7 +12369,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_19_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -12558,7 +12558,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_20_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -12743,7 +12743,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_20_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -12932,7 +12932,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_21_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -13117,7 +13117,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_21_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -13306,7 +13306,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_22_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -13491,7 +13491,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_22_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -13680,7 +13680,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_23_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -13865,7 +13865,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_23_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14054,7 +14054,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_24_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14239,7 +14239,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_24_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14428,7 +14428,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_25_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14613,7 +14613,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_25_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14802,7 +14802,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_26_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -14987,7 +14987,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_26_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -15176,7 +15176,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_27_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -15361,7 +15361,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_27_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -15550,7 +15550,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_28_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -15735,7 +15735,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_28_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -15924,7 +15924,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_29_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -16109,7 +16109,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_29_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -16298,7 +16298,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_30_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -16483,7 +16483,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_30_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -16672,7 +16672,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_31_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -16917,7 +16917,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -17054,7 +17054,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17213,7 +17213,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_3_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17223,7 +17223,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [3,3]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -17231,7 +17231,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [3,3]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -17249,7 +17249,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,3]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [3,3]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -17370,7 +17370,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17529,7 +17529,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_4_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17539,7 +17539,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,4]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -17547,7 +17547,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,4]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -17565,7 +17565,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4,4]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -17686,7 +17686,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17845,7 +17845,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_5_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -17855,7 +17855,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [5,5]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -17863,7 +17863,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [5,5]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -17881,7 +17881,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,5]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [5,5]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -18002,7 +18002,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18161,7 +18161,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_6_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18171,7 +18171,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,6]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -18179,7 +18179,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,6]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -18197,7 +18197,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,6]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,6]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -18318,7 +18318,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18477,7 +18477,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_7_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18487,7 +18487,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -18495,7 +18495,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -18513,7 +18513,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [7,7]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -18634,7 +18634,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18793,7 +18793,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_8_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -18803,7 +18803,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8,8]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -18811,7 +18811,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8,8]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -18829,7 +18829,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8,8]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -18950,7 +18950,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19109,7 +19109,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_9_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19119,7 +19119,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9,9]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -19127,7 +19127,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9,9]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -19145,7 +19145,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,9]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9,9]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -19266,7 +19266,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19425,7 +19425,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_10_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19435,7 +19435,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,10]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -19443,7 +19443,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,10]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -19461,7 +19461,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,10]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [10,10]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -19582,7 +19582,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19741,7 +19741,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_11_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -19751,7 +19751,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [11,11]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -19759,7 +19759,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [11,11]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -19777,7 +19777,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [11,11]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [11,11]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -19898,7 +19898,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20057,7 +20057,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_12_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20067,7 +20067,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12,12]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -20075,7 +20075,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12,12]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -20093,7 +20093,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,12]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [12,12]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -20214,7 +20214,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20373,7 +20373,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_13_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20383,7 +20383,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [13,13]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -20391,7 +20391,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [13,13]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -20409,7 +20409,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,13]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [13,13]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -20530,7 +20530,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20689,7 +20689,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_14_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -20699,7 +20699,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14,14]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -20707,7 +20707,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14,14]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -20725,7 +20725,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,14]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [14,14]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -20846,7 +20846,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21005,7 +21005,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_15_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21015,7 +21015,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [15,15]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -21023,7 +21023,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -21041,7 +21041,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [15,15]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -21162,7 +21162,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_15_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21321,7 +21321,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_16_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21331,7 +21331,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [16,16]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -21339,7 +21339,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [16,16]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -21357,7 +21357,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [16,16]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -21478,7 +21478,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_16_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21637,7 +21637,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_17_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21647,7 +21647,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [17,17]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -21655,7 +21655,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,17]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [17,17]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -21673,7 +21673,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,17]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [17,17]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -21794,7 +21794,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_17_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21953,7 +21953,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_18_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -21963,7 +21963,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18,18]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -21971,7 +21971,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18,18]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18,18]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -21989,7 +21989,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [18,18]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18,18]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -22110,7 +22110,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_18_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22269,7 +22269,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_19_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22279,7 +22279,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [19,19]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -22287,7 +22287,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [19,19]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [19,19]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -22305,7 +22305,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [19,19]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [19,19]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -22426,7 +22426,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_19_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22585,7 +22585,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_20_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22595,7 +22595,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [20,20]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -22603,7 +22603,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [20,20]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [20,20]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -22621,7 +22621,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [20,20]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [20,20]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -22742,7 +22742,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_20_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22901,7 +22901,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_21_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -22911,7 +22911,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -22919,7 +22919,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -22937,7 +22937,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [21,21]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -23058,7 +23058,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_21_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23217,7 +23217,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_22_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23227,7 +23227,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -23235,7 +23235,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -23253,7 +23253,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [22,22]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -23374,7 +23374,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_22_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23533,7 +23533,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_23_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23543,7 +23543,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -23551,7 +23551,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -23569,7 +23569,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [23,23]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -23690,7 +23690,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_23_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23849,7 +23849,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_24_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -23859,7 +23859,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -23867,7 +23867,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -23885,7 +23885,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [24,24]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -24006,7 +24006,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_24_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24165,7 +24165,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_25_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24175,7 +24175,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -24183,7 +24183,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -24201,7 +24201,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [25,25]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -24322,7 +24322,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_25_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24481,7 +24481,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_26_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24491,7 +24491,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -24499,7 +24499,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -24517,7 +24517,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [26,26]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -24638,7 +24638,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_26_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24797,7 +24797,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_27_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -24807,7 +24807,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -24815,7 +24815,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -24833,7 +24833,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [27,27]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -24954,7 +24954,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_27_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25113,7 +25113,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_28_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25123,7 +25123,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [28,28]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -25131,7 +25131,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [28,28]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -25149,7 +25149,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [28,28]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [28,28]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -25270,7 +25270,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_28_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25429,7 +25429,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_29_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25439,7 +25439,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [29,29]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -25447,7 +25447,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [29,29]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -25465,7 +25465,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [29,29]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [29,29]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -25586,7 +25586,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_29_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25745,7 +25745,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_30_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -25755,7 +25755,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [30,30]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -25763,7 +25763,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [30,30]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -25781,7 +25781,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [30,30]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [30,30]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -25902,7 +25902,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_30_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26061,7 +26061,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_31_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26071,7 +26071,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [31,31]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -26079,7 +26079,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [31,31]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -26097,7 +26097,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [31,31]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [31,31]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -26218,7 +26218,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_31_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26377,7 +26377,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_32_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26387,7 +26387,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32,32]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -26395,7 +26395,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32,32]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -26413,7 +26413,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32,32]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32,32]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -26534,7 +26534,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_32_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26693,7 +26693,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_33_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -26703,7 +26703,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [33,33]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -26711,7 +26711,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [33,33]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -26729,7 +26729,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [33,33]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [33,33]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -26850,7 +26850,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_33_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27009,7 +27009,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_34_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27019,7 +27019,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [34,34]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -27027,7 +27027,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [34,34]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -27045,7 +27045,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [34,34]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [34,34]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -27166,7 +27166,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_34_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27325,7 +27325,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_35_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27335,7 +27335,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [35,35]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -27343,7 +27343,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [35,35]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -27361,7 +27361,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [35,35]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [35,35]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -27482,7 +27482,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_35_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27641,7 +27641,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_36_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27651,7 +27651,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [36,36]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -27659,7 +27659,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [36,36]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -27677,7 +27677,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [36,36]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [36,36]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -27798,7 +27798,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_36_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27957,7 +27957,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_37_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -27967,7 +27967,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [37,37]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -27975,7 +27975,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [37,37]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [37,37]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -27993,7 +27993,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [37,37]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [37,37]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -28114,7 +28114,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_37_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28273,7 +28273,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_38_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28283,7 +28283,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [38,38]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -28291,7 +28291,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [38,38]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [38,38]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -28309,7 +28309,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [38,38]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [38,38]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -28430,7 +28430,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_38_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28589,7 +28589,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_39_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28599,7 +28599,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [39,39]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -28607,7 +28607,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [39,39]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [39,39]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -28625,7 +28625,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [39,39]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [39,39]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -28746,7 +28746,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_39_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28905,7 +28905,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_40_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -28915,7 +28915,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [40,40]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -28923,7 +28923,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [40,40]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [40,40]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -28941,7 +28941,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [40,40]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [40,40]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -29062,7 +29062,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_40_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29221,7 +29221,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_41_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29231,7 +29231,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [41,41]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -29239,7 +29239,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [41,41]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [41,41]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -29257,7 +29257,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [41,41]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [41,41]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -29378,7 +29378,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_41_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29537,7 +29537,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_42_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29547,7 +29547,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [42,42]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -29555,7 +29555,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [42,42]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -29573,7 +29573,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [42,42]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -29694,7 +29694,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_42_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29853,7 +29853,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_43_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -29863,7 +29863,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [43,43]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -29871,7 +29871,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [43,43]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [43,43]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -29889,7 +29889,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [43,43]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [43,43]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -30010,7 +30010,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_43_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30169,7 +30169,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_44_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30179,7 +30179,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [44,44]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -30187,7 +30187,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [44,44]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [44,44]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -30205,7 +30205,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [44,44]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [44,44]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -30326,7 +30326,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_44_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30485,7 +30485,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_45_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30495,7 +30495,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [45,45]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -30503,7 +30503,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [45,45]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [45,45]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -30521,7 +30521,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [45,45]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [45,45]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -30642,7 +30642,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_45_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30801,7 +30801,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_46_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -30811,7 +30811,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [46,46]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -30819,7 +30819,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [46,46]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [46,46]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -30837,7 +30837,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [46,46]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [46,46]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -30958,7 +30958,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_46_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31117,7 +31117,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_47_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31127,7 +31127,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [47,47]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -31135,7 +31135,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [47,47]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [47,47]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -31153,7 +31153,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [47,47]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [47,47]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -31274,7 +31274,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_47_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31433,7 +31433,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_48_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31443,7 +31443,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [48,48]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -31451,7 +31451,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [48,48]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [48,48]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -31469,7 +31469,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [48,48]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [48,48]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -31590,7 +31590,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_48_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31749,7 +31749,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_49_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -31759,7 +31759,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [49,49]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -31767,7 +31767,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [49,49]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [49,49]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -31785,7 +31785,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [49,49]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [49,49]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -31906,7 +31906,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_49_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32065,7 +32065,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_50_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32075,7 +32075,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [50,50]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -32083,7 +32083,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [50,50]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [50,50]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -32101,7 +32101,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [50,50]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [50,50]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -32222,7 +32222,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_50_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32381,7 +32381,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_51_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32391,7 +32391,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [51,51]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -32399,7 +32399,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [51,51]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [51,51]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -32417,7 +32417,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [51,51]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [51,51]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -32538,7 +32538,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_51_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32697,7 +32697,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_52_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -32707,7 +32707,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [52,52]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -32715,7 +32715,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [52,52]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [52,52]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -32733,7 +32733,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [52,52]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [52,52]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -32854,7 +32854,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_52_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33013,7 +33013,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_53_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33023,7 +33023,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [53,53]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -33031,7 +33031,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [53,53]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [53,53]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -33049,7 +33049,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [53,53]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [53,53]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -33170,7 +33170,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_53_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33329,7 +33329,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_54_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33339,7 +33339,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [54,54]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -33347,7 +33347,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [54,54]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [54,54]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -33365,7 +33365,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [54,54]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [54,54]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -33486,7 +33486,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_54_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33645,7 +33645,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_55_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33655,7 +33655,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [55,55]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -33663,7 +33663,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [55,55]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [55,55]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -33681,7 +33681,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [55,55]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [55,55]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -33802,7 +33802,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_55_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33961,7 +33961,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_56_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -33971,7 +33971,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [56,56]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -33979,7 +33979,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [56,56]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [56,56]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -33997,7 +33997,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [56,56]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [56,56]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -34118,7 +34118,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_56_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34277,7 +34277,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_57_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34287,7 +34287,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [57,57]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -34295,7 +34295,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [57,57]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [57,57]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -34313,7 +34313,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [57,57]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [57,57]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -34434,7 +34434,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_57_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34593,7 +34593,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_58_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34603,7 +34603,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [58,58]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -34611,7 +34611,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [58,58]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [58,58]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -34629,7 +34629,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [58,58]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [58,58]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -34750,7 +34750,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_58_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34909,7 +34909,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_59_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -34919,7 +34919,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -34927,7 +34927,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -34945,7 +34945,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [59,59]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -35066,7 +35066,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_59_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35225,7 +35225,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_60_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35235,7 +35235,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -35243,7 +35243,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -35261,7 +35261,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,60]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -35382,7 +35382,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_60_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35541,7 +35541,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_61_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35551,7 +35551,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -35559,7 +35559,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -35577,7 +35577,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [61,61]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -35698,7 +35698,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_61_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35857,7 +35857,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_62_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -35867,7 +35867,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -35875,7 +35875,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -35893,7 +35893,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [62,62]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
@@ -36014,7 +36014,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_62_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -36173,7 +36173,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_63_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -36183,7 +36183,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
@@ -36191,7 +36191,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ:       # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
 ; AVX512VPOPCNTDQ-NEXT:    retq
@@ -36209,7 +36209,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; BITALG_NOVLX-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; BITALG_NOVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index a03e34dc46b32..21792140625cd 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -3,8 +3,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VPOPCNTDQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VPOPCNTDQVL
@@ -84,19 +84,33 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: testv2i64:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: testv2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: testv2i64:
 ; XOP:       # %bb.0:
@@ -235,23 +249,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: testv4i32:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1OR2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1OR2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: testv4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: testv4i32:
 ; XOP:       # %bb.0:
@@ -390,20 +422,35 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: testv8i16:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpsllw $8, %xmm0, %xmm1
-; AVX1OR2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: testv8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: testv8i16:
 ; XOP:       # %bb.0:
@@ -518,17 +565,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: testv16i8:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: testv16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: testv16i8:
 ; XOP:       # %bb.0:

diff  --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
index c5bb1dfe6001b..fad3effc66f9f 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
@@ -119,14 +119,14 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_2_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -161,9 +161,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -174,9 +175,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -187,9 +189,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -242,43 +245,46 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_3_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_3_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_3_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -286,14 +292,14 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_3_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -328,9 +334,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -341,9 +348,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -354,9 +362,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -409,43 +418,46 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_4_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_4_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_4_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -453,14 +465,14 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_4_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -495,9 +507,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -508,9 +521,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -521,9 +535,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -576,43 +591,46 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_5_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_5_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_5_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -620,14 +638,14 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_5_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -662,9 +680,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -675,9 +694,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_5_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -688,9 +708,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -743,43 +764,46 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_6_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_6_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_6_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -787,14 +811,14 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_6_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -829,9 +853,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -842,9 +867,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_6_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -855,9 +881,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -910,43 +937,46 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
 ;
 ; AVX2-LABEL: ult_7_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_7_v32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_7_v32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -954,14 +984,14 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG_NOVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_7_v32i8:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -1082,14 +1112,14 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_2_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1130,9 +1160,10 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1210,9 +1241,10 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_3_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1221,7 +1253,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1230,7 +1262,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -1239,7 +1271,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -1247,14 +1279,14 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_3_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1295,9 +1327,10 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1375,9 +1408,10 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_4_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1386,7 +1420,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1395,7 +1429,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -1404,7 +1438,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -1412,14 +1446,14 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_4_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1460,9 +1494,10 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1540,9 +1575,10 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_5_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1551,7 +1587,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1560,7 +1596,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -1569,7 +1605,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -1577,14 +1613,14 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_5_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1625,9 +1661,10 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1705,9 +1742,10 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_6_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1716,7 +1754,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1725,7 +1763,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -1734,7 +1772,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -1742,14 +1780,14 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_6_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1790,9 +1828,10 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1870,9 +1909,10 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_7_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1881,7 +1921,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1890,7 +1930,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -1899,7 +1939,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -1907,14 +1947,14 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_7_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1955,9 +1995,10 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2035,9 +2076,10 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_8_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2046,7 +2088,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2055,7 +2097,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2064,7 +2106,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2072,14 +2114,14 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_8_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2120,9 +2162,10 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2200,9 +2243,10 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_9_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2211,7 +2255,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2220,7 +2264,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2229,7 +2273,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2237,14 +2281,14 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_9_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2285,9 +2329,10 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2365,9 +2410,10 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_10_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2376,7 +2422,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2385,7 +2431,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2394,7 +2440,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2402,14 +2448,14 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_10_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2450,9 +2496,10 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2530,9 +2577,10 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_11_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2541,7 +2589,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2550,7 +2598,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2559,7 +2607,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2567,14 +2615,14 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_11_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2615,9 +2663,10 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2695,9 +2744,10 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_12_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2706,7 +2756,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2715,7 +2765,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2724,7 +2774,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2732,14 +2782,14 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_12_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2780,9 +2830,10 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2860,9 +2911,10 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_13_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -2871,7 +2923,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2880,7 +2932,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -2889,7 +2941,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -2897,14 +2949,14 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_13_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2945,9 +2997,10 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3025,9 +3078,10 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_14_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3036,7 +3090,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -3045,7 +3099,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -3054,7 +3108,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -3062,14 +3116,14 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_14_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3110,9 +3164,10 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3190,9 +3245,10 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
 ;
 ; AVX2-LABEL: ult_15_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3201,7 +3257,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -3210,7 +3266,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
@@ -3219,7 +3275,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
@@ -3227,14 +3283,14 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) {
 ; BITALG_NOVLX:       # %bb.0:
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; BITALG_NOVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: ult_15_v16i16:
 ; BITALG:       # %bb.0:
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; BITALG-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG-NEXT:    retq
   %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3410,9 +3466,10 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3512,9 +3569,10 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_3_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3614,9 +3672,10 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3716,9 +3775,10 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_4_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3818,9 +3878,10 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -3920,9 +3981,10 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_5_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4022,9 +4084,10 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4124,9 +4187,10 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_6_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4226,9 +4290,10 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4328,9 +4393,10 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_7_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4430,9 +4496,10 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4532,9 +4599,10 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_8_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4634,9 +4702,10 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4736,9 +4805,10 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_9_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4838,9 +4908,10 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -4940,9 +5011,10 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_10_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5042,9 +5114,10 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5144,9 +5217,10 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_11_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5246,9 +5320,10 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5348,9 +5423,10 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_12_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5450,9 +5526,10 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5552,9 +5629,10 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_13_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5654,9 +5732,10 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5756,9 +5835,10 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_14_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5858,9 +5938,10 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -5960,9 +6041,10 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_15_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6062,9 +6144,10 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_15_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6164,9 +6247,10 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_16_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6266,9 +6350,10 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_16_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6368,9 +6453,10 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_17_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6470,9 +6556,10 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_17_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6572,9 +6659,10 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_18_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6674,9 +6762,10 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_18_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6776,9 +6865,10 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_19_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6878,9 +6968,10 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_19_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -6980,9 +7071,10 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_20_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7082,9 +7174,10 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_20_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7184,9 +7277,10 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_21_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7286,9 +7380,10 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_21_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7388,9 +7483,10 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_22_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7490,9 +7586,10 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_22_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7592,9 +7689,10 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_23_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7694,9 +7792,10 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_23_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7796,9 +7895,10 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_24_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -7898,9 +7998,10 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_24_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8000,9 +8101,10 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_25_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8102,9 +8204,10 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_25_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8204,9 +8307,10 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_26_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8306,9 +8410,10 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_26_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8408,9 +8513,10 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_27_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8510,9 +8616,10 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_27_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8612,9 +8719,10 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_28_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8714,9 +8822,10 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_28_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8816,9 +8925,10 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_29_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -8918,9 +9028,10 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_29_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9020,9 +9131,10 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_30_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9122,9 +9234,10 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ugt_30_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9224,9 +9337,10 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
 ;
 ; AVX2-LABEL: ult_31_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9448,9 +9562,10 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_2_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9530,9 +9645,10 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_3_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9612,9 +9728,10 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_3_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9694,9 +9811,10 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_4_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9776,9 +9894,10 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_4_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9858,9 +9977,10 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_5_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -9940,9 +10060,10 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_5_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10022,9 +10143,10 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_6_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10104,9 +10226,10 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_6_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10186,9 +10309,10 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_7_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10268,9 +10392,10 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_7_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10350,9 +10475,10 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_8_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10432,9 +10558,10 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_8_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10514,9 +10641,10 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_9_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10596,9 +10724,10 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_9_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10678,9 +10807,10 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_10_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10760,9 +10890,10 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_10_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10842,9 +10973,10 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_11_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -10924,9 +11056,10 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_11_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11006,9 +11139,10 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_12_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11088,9 +11222,10 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_12_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11170,9 +11305,10 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_13_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11252,9 +11388,10 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_13_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11334,9 +11471,10 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_14_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11416,9 +11554,10 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_14_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11498,9 +11637,10 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_15_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11580,9 +11720,10 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_15_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11662,9 +11803,10 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_16_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11744,9 +11886,10 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_16_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11826,9 +11969,10 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_17_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11908,9 +12052,10 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_17_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -11990,9 +12135,10 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_18_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12072,9 +12218,10 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_18_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12154,9 +12301,10 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_19_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12236,9 +12384,10 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_19_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12318,9 +12467,10 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_20_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12400,9 +12550,10 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_20_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12482,9 +12633,10 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_21_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12564,9 +12716,10 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_21_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12646,9 +12799,10 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_22_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12728,9 +12882,10 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_22_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12810,9 +12965,10 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_23_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12892,9 +13048,10 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_23_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -12974,9 +13131,10 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_24_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13056,9 +13214,10 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_24_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13138,9 +13297,10 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_25_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13220,9 +13380,10 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_25_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13302,9 +13463,10 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_26_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13384,9 +13546,10 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_26_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13466,9 +13629,10 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_27_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13548,9 +13712,10 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_27_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13630,9 +13795,10 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_28_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13712,9 +13878,10 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_28_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13794,9 +13961,10 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_29_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13876,9 +14044,10 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_29_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -13958,9 +14127,10 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_30_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14040,9 +14210,10 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_30_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14122,9 +14293,10 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_31_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14204,9 +14376,10 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_31_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14286,9 +14459,10 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_32_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14368,9 +14542,10 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_32_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14450,9 +14625,10 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_33_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14532,9 +14708,10 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_33_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14614,9 +14791,10 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_34_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14696,9 +14874,10 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_34_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14778,9 +14957,10 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_35_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14860,9 +15040,10 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_35_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -14942,9 +15123,10 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_36_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15024,9 +15206,10 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_36_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15106,9 +15289,10 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_37_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15188,9 +15372,10 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_37_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15270,9 +15455,10 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_38_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15352,9 +15538,10 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_38_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15434,9 +15621,10 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_39_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15516,9 +15704,10 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_39_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15598,9 +15787,10 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_40_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15680,9 +15870,10 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_40_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15762,9 +15953,10 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_41_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15844,9 +16036,10 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_41_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -15926,9 +16119,10 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_42_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16008,9 +16202,10 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_42_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16090,9 +16285,10 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_43_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16172,9 +16368,10 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_43_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16254,9 +16451,10 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_44_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16336,9 +16534,10 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_44_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16418,9 +16617,10 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_45_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16500,9 +16700,10 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_45_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16582,9 +16783,10 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_46_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16664,9 +16866,10 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_46_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16746,9 +16949,10 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_47_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16828,9 +17032,10 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_47_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16910,9 +17115,10 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_48_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -16992,9 +17198,10 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_48_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17074,9 +17281,10 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_49_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17156,9 +17364,10 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_49_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17238,9 +17447,10 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_50_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17320,9 +17530,10 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_50_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17402,9 +17613,10 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_51_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17484,9 +17696,10 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_51_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17566,9 +17779,10 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_52_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17648,9 +17862,10 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_52_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17730,9 +17945,10 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_53_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17812,9 +18028,10 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_53_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17894,9 +18111,10 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_54_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -17976,9 +18194,10 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_54_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18058,9 +18277,10 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_55_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18140,9 +18360,10 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_55_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18222,9 +18443,10 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_56_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18304,9 +18526,10 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_56_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18386,9 +18609,10 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_57_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18468,9 +18692,10 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_57_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18550,9 +18775,10 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_58_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18632,9 +18858,10 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_58_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18714,9 +18941,10 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_59_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18796,9 +19024,10 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_59_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18878,9 +19107,10 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_60_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -18960,9 +19190,10 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_60_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -19042,9 +19273,10 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_61_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -19124,9 +19356,10 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_61_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -19206,9 +19439,10 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_62_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -19288,9 +19522,10 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_62_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -19370,9 +19605,10 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_63_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0

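(Editorial orientation for readers skimming these FileCheck updates, not part of the patch: every hunk above rewrites the load of a splatted constant — the 0x0F nibble mask and the 16-entry bit-count table — from vmovdqa into vpbroadcastb / vbroadcasti128, while the popcount algorithm itself is unchanged. A minimal C++ intrinsics sketch of the nibble-LUT per-byte popcount that these AVX2 sequences implement; the helper name popcnt_epi8 is illustrative only:

  #include <immintrin.h>

  // Per-byte population count of a 256-bit vector via the 4-bit LUT trick:
  // split each byte into nibbles, look both up in a broadcast 16-entry table
  // of bit counts, and add the two lookups.
  static __m256i popcnt_epi8(__m256i v) {
    const __m256i low_mask = _mm256_set1_epi8(0x0F);         // vpbroadcastb of 15
    const __m256i lut = _mm256_broadcastsi128_si256(         // vbroadcasti128 of the table
        _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4));
    const __m256i lo = _mm256_and_si256(v, low_mask);                        // vpand
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);  // vpsrlw $4 + vpand
    return _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo),                     // vpshufb
                           _mm256_shuffle_epi8(lut, hi));                    // vpshufb + vpaddb
  }

The ugt_N/ult_N tests then compare the reduced counts against a second splatted constant, which is the other class of constant loads this patch turns into broadcasts.)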
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
index c35acdcf857d7..568c121409166 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
@@ -33,9 +33,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -131,9 +132,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -244,9 +246,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -334,9 +337,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -365,9 +369,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -377,9 +382,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0

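(Editorial note, not part of the patch: the 512-bit file below follows the same pattern at zmm width — on the AVX512BW prefixes the byte splat becomes vpbroadcastb zmm and the 16-byte LUT becomes vbroadcasti32x4, a 128-bit-lane splat. A hedged intrinsics equivalent of just those two constants, assuming AVX512BW is available; the function names are illustrative:

  #include <immintrin.h>

  // zmm analogues of the broadcast constants in the checks below.
  static __m512i low_mask_v64i8() {
    return _mm512_set1_epi8(0x0F);                    // vpbroadcastb zmm
  }
  static __m512i nibble_popcnt_lut_v64i8() {
    return _mm512_broadcast_i32x4(                    // vbroadcasti32x4
        _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4));
  }
)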
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
index ef17822b1cf9b..182415f0ae5e2 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
@@ -125,9 +125,10 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) {
 define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ugt_2_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -140,7 +141,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -148,9 +149,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ugt_2_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -162,9 +164,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_2_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -177,7 +180,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -185,9 +188,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -212,9 +216,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
 define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ult_3_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -227,7 +232,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -235,9 +240,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ult_3_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -249,9 +255,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_3_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -264,7 +271,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -272,9 +279,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -299,9 +307,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
 define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ugt_3_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -314,7 +323,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -322,9 +331,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ugt_3_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -336,9 +346,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_3_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -351,7 +362,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -359,9 +370,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -386,9 +398,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
 define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ult_4_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -401,7 +414,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -409,9 +422,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ult_4_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -423,9 +437,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_4_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -438,7 +453,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -446,9 +461,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -473,9 +489,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
 define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ugt_4_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -488,7 +505,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -496,9 +513,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ugt_4_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -510,9 +528,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_4_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -525,7 +544,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -533,9 +552,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -560,9 +580,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
 define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ult_5_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -575,7 +596,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -583,9 +604,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ult_5_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -597,9 +619,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_5_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -612,7 +635,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -620,9 +643,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -647,9 +671,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
 define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ugt_5_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -662,7 +687,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -670,9 +695,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ugt_5_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -684,9 +710,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_5_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -699,7 +726,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -707,9 +734,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -734,9 +762,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
 define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ult_6_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -749,7 +778,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -757,9 +786,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ult_6_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -771,9 +801,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_6_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -786,7 +817,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -794,9 +825,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -821,9 +853,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
 define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ugt_6_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -836,7 +869,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -844,9 +877,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ugt_6_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -858,9 +892,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_6_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -873,7 +908,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -881,9 +916,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -908,9 +944,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
 define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ; AVX512F-LABEL: ult_7_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -923,7 +960,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -931,9 +968,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ;
 ; AVX512BW-LABEL: ult_7_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -945,9 +983,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_7_v64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -960,7 +999,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -968,9 +1007,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1111,9 +1151,10 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_2_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1132,7 +1173,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1140,9 +1181,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_2_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1164,7 +1206,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1172,9 +1214,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1202,9 +1245,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_3_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1223,7 +1267,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1231,9 +1275,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_3_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1255,7 +1300,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1263,9 +1308,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1293,9 +1339,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_3_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1314,7 +1361,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1322,9 +1369,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_3_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1346,7 +1394,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1354,9 +1402,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1384,9 +1433,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_4_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1405,7 +1455,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1413,9 +1463,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_4_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1437,7 +1488,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1445,9 +1496,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1475,9 +1527,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_4_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1496,7 +1549,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1504,9 +1557,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_4_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1528,7 +1582,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1536,9 +1590,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1566,9 +1621,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_5_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1587,7 +1643,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1595,9 +1651,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_5_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1619,7 +1676,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1627,9 +1684,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1657,9 +1715,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_5_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1678,7 +1737,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1686,9 +1745,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_5_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1710,7 +1770,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1718,9 +1778,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1748,9 +1809,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_6_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1769,7 +1831,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1777,9 +1839,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_6_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1801,7 +1864,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1809,9 +1872,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1839,9 +1903,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_6_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1860,7 +1925,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1868,9 +1933,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_6_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1892,7 +1958,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1900,9 +1966,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1930,9 +1997,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_7_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -1951,7 +2019,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1959,9 +2027,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_7_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -1983,7 +2052,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -1991,9 +2060,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2021,9 +2091,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_7_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2042,7 +2113,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2050,9 +2121,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_7_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2074,7 +2146,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2082,9 +2154,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_7_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2112,9 +2185,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_8_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2133,7 +2207,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2141,9 +2215,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_8_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2165,7 +2240,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2173,9 +2248,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_8_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2203,9 +2279,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_8_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2224,7 +2301,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2232,9 +2309,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_8_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2256,7 +2334,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2264,9 +2342,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_8_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2294,9 +2373,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_9_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2315,7 +2395,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2323,9 +2403,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_9_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2347,7 +2428,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2355,9 +2436,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_9_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2385,9 +2467,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_9_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2406,7 +2489,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2414,9 +2497,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_9_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2438,7 +2522,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2446,9 +2530,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_9_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2476,9 +2561,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_10_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2497,7 +2583,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2505,9 +2591,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_10_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2529,7 +2616,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2537,9 +2624,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_10_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2567,9 +2655,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_10_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2588,7 +2677,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2596,9 +2685,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_10_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2620,7 +2710,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2628,9 +2718,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_10_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2658,9 +2749,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_11_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2679,7 +2771,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2687,9 +2779,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_11_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2711,7 +2804,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2719,9 +2812,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_11_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2749,9 +2843,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_11_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2770,7 +2865,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2778,9 +2873,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_11_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2802,7 +2898,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2810,9 +2906,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_11_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2840,9 +2937,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_12_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2861,7 +2959,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2869,9 +2967,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_12_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2893,7 +2992,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2901,9 +3000,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_12_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2931,9 +3031,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_12_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -2952,7 +3053,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2960,9 +3061,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_12_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -2984,7 +3086,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2992,9 +3094,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_12_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3022,9 +3125,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_13_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -3043,7 +3147,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3051,9 +3155,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_13_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3075,7 +3180,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3083,9 +3188,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_13_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3113,9 +3219,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_13_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -3134,7 +3241,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3142,9 +3249,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_13_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3166,7 +3274,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3174,9 +3282,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_13_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3204,9 +3313,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_14_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -3225,7 +3335,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3233,9 +3343,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_14_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3257,7 +3368,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3265,9 +3376,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_14_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3295,9 +3407,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_14_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -3316,7 +3429,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3324,9 +3437,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ugt_14_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3348,7 +3462,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3356,9 +3470,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_14_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3386,9 +3501,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_15_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -3407,7 +3523,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
 ; AVX512F-NEXT:    vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3415,9 +3531,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
 ;
 ; AVX512BW-LABEL: ult_15_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3439,7 +3556,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3447,9 +3564,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_15_v32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3552,9 +3670,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_2_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3584,9 +3703,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_2_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3631,9 +3751,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_3_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3663,9 +3784,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_3_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3710,9 +3832,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_3_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3742,9 +3865,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_3_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3789,9 +3913,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_4_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3821,9 +3946,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_4_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3868,9 +3994,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_4_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3900,9 +4027,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_4_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -3947,9 +4075,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_5_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -3979,9 +4108,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_5_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4026,9 +4156,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_5_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4058,9 +4189,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_5_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4105,9 +4237,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_6_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4137,9 +4270,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_6_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4184,9 +4318,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_6_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4216,9 +4351,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_6_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4263,9 +4399,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_7_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4295,9 +4432,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_7_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4342,9 +4480,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_7_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4374,9 +4513,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_7_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4421,9 +4561,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_8_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4453,9 +4594,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_8_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4500,9 +4642,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_8_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4532,9 +4675,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_8_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4579,9 +4723,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_9_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4611,9 +4756,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_9_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4658,9 +4804,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_9_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4690,9 +4837,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_9_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4737,9 +4885,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_10_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4769,9 +4918,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_10_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4816,9 +4966,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_10_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4848,9 +4999,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_10_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4895,9 +5047,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_11_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -4927,9 +5080,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_11_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -4974,9 +5128,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_11_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5006,9 +5161,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_11_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5053,9 +5209,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_12_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5085,9 +5242,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_12_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5132,9 +5290,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_12_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5164,9 +5323,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_12_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5211,9 +5371,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_13_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5243,9 +5404,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_13_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5290,9 +5452,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_13_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5322,9 +5485,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_13_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5369,9 +5533,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_14_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5401,9 +5566,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_14_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5448,9 +5614,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_14_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5480,9 +5647,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_14_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5527,9 +5695,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_15_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5559,9 +5728,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_15_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5606,9 +5776,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_15_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5638,9 +5809,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_15_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5685,9 +5857,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_16_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5717,9 +5890,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_16_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5764,9 +5938,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_16_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5796,9 +5971,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_16_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5843,9 +6019,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_17_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5875,9 +6052,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_17_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -5922,9 +6100,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_17_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -5954,9 +6133,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_17_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6001,9 +6181,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_18_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6033,9 +6214,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_18_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6080,9 +6262,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_18_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6112,9 +6295,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_18_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6159,9 +6343,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_19_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6191,9 +6376,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_19_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6238,9 +6424,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_19_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6270,9 +6457,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_19_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6317,9 +6505,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_20_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6349,9 +6538,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_20_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6396,9 +6586,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_20_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6428,9 +6619,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_20_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6475,9 +6667,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_21_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6507,9 +6700,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_21_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6554,9 +6748,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_21_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6586,9 +6781,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_21_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6633,9 +6829,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_22_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6665,9 +6862,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_22_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6712,9 +6910,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_22_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6744,9 +6943,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_22_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6791,9 +6991,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_23_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6823,9 +7024,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_23_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6870,9 +7072,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_23_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6902,9 +7105,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_23_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -6949,9 +7153,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_24_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -6981,9 +7186,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_24_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7028,9 +7234,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_24_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7060,9 +7267,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_24_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7107,9 +7315,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_25_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7139,9 +7348,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_25_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7186,9 +7396,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_25_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7218,9 +7429,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_25_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7265,9 +7477,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_26_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7297,9 +7510,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_26_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7344,9 +7558,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_26_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7376,9 +7591,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_26_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7423,9 +7639,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_27_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7455,9 +7672,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_27_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7502,9 +7720,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_27_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7534,9 +7753,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_27_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7581,9 +7801,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_28_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7613,9 +7834,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_28_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7660,9 +7882,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_28_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7692,9 +7915,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_28_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7739,9 +7963,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_29_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7771,9 +7996,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_29_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7818,9 +8044,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_29_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7850,9 +8077,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_29_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7897,9 +8125,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_30_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -7929,9 +8158,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_30_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -7976,9 +8206,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ugt_30_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8008,9 +8239,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ugt_30_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8055,9 +8287,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 ; AVX512F-LABEL: ult_31_v16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8087,9 +8320,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) {
 ;
 ; AVX512BW-LABEL: ult_31_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8208,9 +8442,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_2_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8232,9 +8467,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_2_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8271,9 +8507,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_3_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8295,9 +8532,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_3_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8334,9 +8572,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_3_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8358,9 +8597,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_3_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8397,9 +8637,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_4_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8421,9 +8662,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_4_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8460,9 +8702,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_4_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8484,9 +8727,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_4_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8523,9 +8767,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_5_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8547,9 +8792,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_5_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8586,9 +8832,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_5_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8610,9 +8857,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_5_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8649,9 +8897,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_6_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8673,9 +8922,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_6_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8712,9 +8962,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_6_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8736,9 +8987,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_6_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8775,9 +9027,10 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_7_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8799,9 +9052,10 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_7_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8838,9 +9092,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_7_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8862,9 +9117,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_7_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8901,9 +9157,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_8_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8925,9 +9182,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_8_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -8964,9 +9222,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_8_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -8988,9 +9247,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_8_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9027,9 +9287,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_9_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9051,9 +9312,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_9_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9090,9 +9352,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_9_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9114,9 +9377,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_9_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9153,9 +9417,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_10_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9177,9 +9442,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_10_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9216,9 +9482,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_10_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9240,9 +9507,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_10_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9279,9 +9547,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_11_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9303,9 +9572,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_11_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9342,9 +9612,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_11_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9366,9 +9637,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_11_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9405,9 +9677,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_12_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9429,9 +9702,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_12_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9468,9 +9742,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_12_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9492,9 +9767,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_12_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9531,9 +9807,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_13_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9555,9 +9832,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_13_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9594,9 +9872,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_13_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9618,9 +9897,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_13_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9657,9 +9937,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_14_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9681,9 +9962,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_14_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9720,9 +10002,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_14_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9744,9 +10027,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_14_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9783,9 +10067,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_15_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9807,9 +10092,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_15_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9846,9 +10132,10 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_15_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9870,9 +10157,10 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_15_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9909,9 +10197,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_16_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9933,9 +10222,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_16_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -9972,9 +10262,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_16_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -9996,9 +10287,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_16_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10035,9 +10327,10 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_17_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10059,9 +10352,10 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_17_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10098,9 +10392,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_17_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10122,9 +10417,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_17_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10161,9 +10457,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_18_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10185,9 +10482,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_18_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10224,9 +10522,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_18_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10248,9 +10547,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_18_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10287,9 +10587,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_19_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10311,9 +10612,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_19_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10350,9 +10652,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_19_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10374,9 +10677,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_19_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10413,9 +10717,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_20_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10437,9 +10742,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_20_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10476,9 +10782,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_20_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10500,9 +10807,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_20_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10539,9 +10847,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_21_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10563,9 +10872,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_21_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10602,9 +10912,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_21_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10626,9 +10937,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_21_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10665,9 +10977,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_22_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10689,9 +11002,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_22_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10728,9 +11042,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_22_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10752,9 +11067,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_22_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10791,9 +11107,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_23_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10815,9 +11132,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_23_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10854,9 +11172,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_23_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10878,9 +11197,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_23_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10917,9 +11237,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_24_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -10941,9 +11262,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_24_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -10980,9 +11302,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_24_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11004,9 +11327,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_24_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11043,9 +11367,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_25_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11067,9 +11392,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_25_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11106,9 +11432,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_25_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11130,9 +11457,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_25_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11169,9 +11497,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_26_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11193,9 +11522,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_26_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11232,9 +11562,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_26_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11256,9 +11587,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_26_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11295,9 +11627,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_27_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11319,9 +11652,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_27_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11358,9 +11692,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_27_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11382,9 +11717,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_27_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11421,9 +11757,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_28_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11445,9 +11782,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_28_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11484,9 +11822,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_28_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11508,9 +11847,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_28_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11547,9 +11887,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_29_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11571,9 +11912,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_29_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11610,9 +11952,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_29_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11634,9 +11977,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_29_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11673,9 +12017,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_30_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11697,9 +12042,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_30_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11736,9 +12082,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_30_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11760,9 +12107,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_30_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11799,9 +12147,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_31_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11823,9 +12172,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_31_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11862,9 +12212,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_31_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11886,9 +12237,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_31_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11925,9 +12277,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_32_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -11949,9 +12302,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_32_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -11988,9 +12342,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_32_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12012,9 +12367,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_32_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12051,9 +12407,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_33_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12075,9 +12432,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_33_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12114,9 +12472,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_33_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12138,9 +12497,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_33_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12177,9 +12537,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_34_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12201,9 +12562,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_34_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12240,9 +12602,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_34_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12264,9 +12627,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_34_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12303,9 +12667,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_35_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12327,9 +12692,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_35_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12366,9 +12732,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_35_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12390,9 +12757,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_35_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12429,9 +12797,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_36_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12453,9 +12822,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_36_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12492,9 +12862,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_36_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12516,9 +12887,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_36_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12555,9 +12927,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_37_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12579,9 +12952,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_37_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12618,9 +12992,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_37_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12642,9 +13017,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_37_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12681,9 +13057,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_38_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12705,9 +13082,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_38_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12744,9 +13122,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_38_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12768,9 +13147,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_38_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12807,9 +13187,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_39_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12831,9 +13212,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_39_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12870,9 +13252,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_39_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12894,9 +13277,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_39_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12933,9 +13317,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_40_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -12957,9 +13342,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_40_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -12996,9 +13382,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_40_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13020,9 +13407,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_40_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13059,9 +13447,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_41_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13083,9 +13472,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_41_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13122,9 +13512,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_41_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13146,9 +13537,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_41_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13185,9 +13577,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_42_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13209,9 +13602,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_42_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13248,9 +13642,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_42_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13272,9 +13667,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_42_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13311,9 +13707,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_43_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13335,9 +13732,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_43_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13374,9 +13772,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_43_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13398,9 +13797,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_43_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13437,9 +13837,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_44_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13461,9 +13862,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_44_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13500,9 +13902,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_44_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13524,9 +13927,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_44_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13563,9 +13967,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_45_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13587,9 +13992,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_45_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13626,9 +14032,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_45_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13650,9 +14057,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_45_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13689,9 +14097,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_46_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13713,9 +14122,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_46_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13752,9 +14162,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_46_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13776,9 +14187,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_46_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13815,9 +14227,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_47_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13839,9 +14252,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_47_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13878,9 +14292,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_47_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13902,9 +14317,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_47_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -13941,9 +14357,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_48_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -13965,9 +14382,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_48_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14004,9 +14422,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_48_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14028,9 +14447,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_48_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14067,9 +14487,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_49_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14091,9 +14512,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_49_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14130,9 +14552,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_49_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14154,9 +14577,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_49_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14193,9 +14617,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_50_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14217,9 +14642,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_50_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14256,9 +14682,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_50_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14280,9 +14707,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_50_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14319,9 +14747,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_51_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14343,9 +14772,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_51_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14382,9 +14812,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_51_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14406,9 +14837,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_51_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14445,9 +14877,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_52_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14469,9 +14902,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_52_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14508,9 +14942,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_52_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14532,9 +14967,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_52_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14571,9 +15007,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_53_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14595,9 +15032,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_53_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14634,9 +15072,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_53_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14658,9 +15097,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_53_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14697,9 +15137,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_54_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14721,9 +15162,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_54_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14760,9 +15202,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_54_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14784,9 +15227,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_54_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14823,9 +15267,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_55_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14847,9 +15292,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_55_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14886,9 +15332,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_55_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14910,9 +15357,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_55_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -14949,9 +15397,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_56_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -14973,9 +15422,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_56_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15012,9 +15462,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_56_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15036,9 +15487,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_56_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15075,9 +15527,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_57_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15099,9 +15552,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_57_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15138,9 +15592,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_57_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15162,9 +15617,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_57_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15201,9 +15657,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_58_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15225,9 +15682,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_58_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15264,9 +15722,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_58_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15288,9 +15747,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_58_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15327,9 +15787,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_59_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15351,9 +15812,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_59_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15390,9 +15852,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_59_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15414,9 +15877,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_59_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15453,9 +15917,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_60_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15477,9 +15942,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_60_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15516,9 +15982,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_60_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15540,9 +16007,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_60_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15579,9 +16047,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_61_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15603,9 +16072,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_61_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15642,9 +16112,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_61_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15666,9 +16137,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_61_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15705,9 +16177,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_62_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15729,9 +16202,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_62_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15768,9 +16242,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ugt_62_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15792,9 +16267,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ugt_62_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -15831,9 +16307,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) {
 ; AVX512F-LABEL: ult_63_v8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -15855,9 +16332,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) {
 ;
 ; AVX512BW-LABEL: ult_63_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
index 552a27daf971a..1c1caf8ee4681 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
@@ -9,9 +9,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512F-LABEL: testv8i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -31,9 +32,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -62,9 +64,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512F-LABEL: testv16i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -92,9 +95,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -130,9 +134,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512F-LABEL: testv32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpand %ymm1, %ymm4, %ymm4
@@ -156,9 +161,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -183,9 +189,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -208,9 +215,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512F-LABEL: testv64i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -227,9 +235,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -240,9 +249,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
 ; AVX512VPOPCNTDQ-NOBW:       # %bb.0:
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
@@ -259,9 +269,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
 ; AVX512VPOPCNTDQ-BW:       # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 320b63ee20bd5..03e39e71aaaf1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
 ; AVX512BW-LABEL: test_v4i16_v4i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
@@ -1252,7 +1252,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
 ;
 ; AVX2-LABEL: test_v64i16_v64i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 8814cd592a8e8..bcb3f4d14b10a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -439,15 +439,35 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v32i16_v32i1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512-NEXT:    kortestw %k0, %k0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v32i16_v32i1:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    kortestw %k0, %k0
+; AVX512F-NEXT:    sete %al
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v32i16_v32i1:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kortestw %k0, %k0
+; AVX512BW-NEXT:    sete %al
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v32i16_v32i1:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512VL-NEXT:    kortestw %k0, %k0
+; AVX512VL-NEXT:    sete %al
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a = trunc <32 x i16> %0 to <32 x i1>
   %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a)
   ret i1 %b
@@ -490,15 +510,35 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: trunc_v64i8_v64i1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; AVX512-NEXT:    kortestw %k0, %k0
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: trunc_v64i8_v64i1:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    kortestw %k0, %k0
+; AVX512F-NEXT:    sete %al
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: trunc_v64i8_v64i1:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kortestw %k0, %k0
+; AVX512BW-NEXT:    sete %al
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: trunc_v64i8_v64i1:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; AVX512VL-NEXT:    kortestw %k0, %k0
+; AVX512VL-NEXT:    sete %al
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a = trunc <64 x i8> %0 to <64 x i1>
   %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a)
   ret i1 %b

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 36d7dba44b94a..92c2ebc83b142 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -63,16 +63,27 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE42-NEXT:    movq %xmm2, %rax
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: test_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2i64:
 ; AVX512BW:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index b745c97d5025b..5cade4eaaba63 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -63,16 +63,27 @@ define i64 @test_v2i64(<2 x i64> %a0) {
 ; SSE42-NEXT:    movq %xmm2, %rax
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: test_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_v2i64:
 ; AVX512BW:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 4ad271dc23706..5676836f8f7ea 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -469,7 +469,8 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
-; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]

diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index c350531e11b8a..1f35d333de68e 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -73,7 +73,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: var_rotate_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
@@ -344,7 +344,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -355,7 +355,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -670,14 +670,23 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-NEXT:    por %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_rotate_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
-; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_rotate_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_rotate_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512NOVLX-LABEL: splatvar_rotate_v2i64:
 ; AVX512NOVLX:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 4dc837be572c9..7cfdc5d6513ff 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -259,7 +259,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -269,7 +269,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -532,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX2-LABEL: splatvar_rotate_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
@@ -1107,11 +1107,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_rotate_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
@@ -1131,11 +1133,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX512VBMI2-LABEL: constant_rotate_v32i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
+; AVX512VBMI2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512VBMI2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 17f346138a92d..d3bb0e1deb9a1 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -38,7 +38,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512F-LABEL: var_rotate_v32i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -67,7 +67,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512VL-LABEL: var_rotate_v32i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
@@ -97,7 +97,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -107,7 +107,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
@@ -147,7 +147,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm8
 ; AVX512F-NEXT:    vpor %ymm4, %ymm8, %ymm4
@@ -190,7 +190,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm4
 ; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512VL-NEXT:    vpternlogq $248, %ymm8, %ymm4, %ymm6
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2

diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index df7a66a309ed7..8c03a1a03b880 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -66,7 +66,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; AVX2-LABEL: var_shift_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
@@ -632,14 +632,23 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-NEXT:    psubq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_shift_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_shift_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_shift_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_shift_v2i64:
 ; XOPAVX1:       # %bb.0:
@@ -837,7 +846,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
@@ -933,15 +942,25 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ; SSE-NEXT:    psubq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatvar_modulo_shift_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatvar_modulo_shift_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatvar_modulo_shift_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
 ; XOPAVX1:       # %bb.0:
@@ -1130,7 +1149,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
@@ -1708,14 +1727,23 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE-NEXT:    psubb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatconstant_shift_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_shift_v16i8:
 ; XOP:       # %bb.0:
@@ -1726,18 +1754,26 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatconstant_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX512DQVL-LABEL: splatconstant_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v16i8:
 ; X86-SSE:       # %bb.0:
@@ -1762,15 +1798,25 @@ define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) {
 ; SSE-NEXT:    psubq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: PR52719:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %edi, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: PR52719:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %edi, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR52719:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: PR52719:
 ; XOPAVX1:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 99c735dec13c0..a7d4e88af0e6d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -115,7 +115,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; X86-AVX2-LABEL: var_shift_v4i64:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
 ; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -723,7 +723,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; X86-AVX2-LABEL: splatvar_shift_v4i64:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -888,7 +888,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -925,7 +925,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -943,7 +943,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
 ; AVX512DQVL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
@@ -990,7 +990,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; X86-AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -1084,7 +1084,7 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -1252,7 +1252,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -1291,7 +1291,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -1310,7 +1310,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
 ; AVX512DQVL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
 ; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
@@ -1358,7 +1358,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ; X86-AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
@@ -1959,7 +1959,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -1977,7 +1977,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
@@ -1986,18 +1986,26 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatconstant_shift_v32i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX512DQVL-LABEL: splatconstant_shift_v32i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512DQVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v32i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512BWVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: splatconstant_shift_v32i8:
 ; X86-AVX1:       # %bb.0:
@@ -2019,7 +2027,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
@@ -2185,7 +2193,7 @@ define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) {
 ; X86-AVX2-LABEL: PR52719:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index b6ad5306f5d1e..7ea94678e0b8e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -197,7 +197,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
@@ -212,7 +212,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
@@ -286,7 +286,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm4, %ymm4
 ; AVX512DQ-NEXT:    vpxor %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
@@ -301,7 +301,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
@@ -449,9 +449,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $3, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
 ; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512DQ-NEXT:    vpxor %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
@@ -464,7 +464,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-LABEL: splatconstant_shift_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index d501512201cd1..1fbdc3b45cf35 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1356,7 +1356,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
@@ -1494,7 +1494,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
@@ -1632,7 +1632,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
@@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v4i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v2i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -2308,14 +2308,23 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; SSE-NEXT:    psubb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatconstant_shift_v8i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v8i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_shift_v8i8:
 ; XOP:       # %bb.0:
@@ -2326,18 +2335,26 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatconstant_shift_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX512DQVL-LABEL: splatconstant_shift_v8i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v8i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v8i8:
 ; X86-SSE:       # %bb.0:
@@ -2361,14 +2378,23 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; SSE-NEXT:    psubb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatconstant_shift_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_shift_v4i8:
 ; XOP:       # %bb.0:
@@ -2379,18 +2405,26 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatconstant_shift_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX512DQVL-LABEL: splatconstant_shift_v4i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v4i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v4i8:
 ; X86-SSE:       # %bb.0:
@@ -2414,14 +2448,23 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; SSE-NEXT:    psubb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: splatconstant_shift_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splatconstant_shift_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_shift_v2i8:
 ; XOP:       # %bb.0:
@@ -2432,18 +2475,26 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatconstant_shift_v2i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX512DQVL-LABEL: splatconstant_shift_v2i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512DQVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatconstant_shift_v2i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BWVL-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT:    retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v2i8:
 ; X86-SSE:       # %bb.0:

diff  --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index f02849d61454a..f647208a8000e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -53,18 +53,18 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpsrlw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpsrlw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpsrlw $1, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2

diff  --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 9cab44b069fd4..510ae15ba0960 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v4i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v2i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper

diff  --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 07902b4a86dee..dd63565a1dec7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1261,7 +1261,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1291,7 +1291,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1310,7 +1310,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512DQVL-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index 92b60490f976d..8eca56d099feb 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -53,13 +53,13 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpsllw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
 ; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpsllw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
@@ -301,7 +301,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
 ; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512DQ-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
@@ -323,7 +323,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0

diff  --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index bd7b250b3d8c5..e7600d272c66f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v4i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
@@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
 ; AVX512BW-LABEL: constant_shift_v2i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4e9d17801f5ce..95320ecc340cd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -613,11 +613,17 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
 ; AVX512VL:       # %bb.0:
@@ -625,6 +631,18 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
 ; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   ret <16 x i8> %shuffle
 }
@@ -653,11 +671,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
-; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
 ; AVX512VL:       # %bb.0:
@@ -665,6 +689,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
 ; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
   ret <16 x i8> %shuffle
 }
@@ -707,11 +743,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
-; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
 ; AVX512VL:       # %bb.0:
@@ -719,6 +761,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
 ; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
   ret <16 x i8> %shuffle
 }
@@ -910,7 +964,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(
 ;
 ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
@@ -966,7 +1020,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
 ;
 ; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -974,7 +1028,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
 ;
 ; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
 ; AVX512VLBW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1021,11 +1075,17 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: load_fold_pblendvb:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
-; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: load_fold_pblendvb:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb:
 ; AVX512VL:       # %bb.0:
@@ -1033,6 +1093,18 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) {
 ; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: load_fold_pblendvb:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX1-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: load_fold_pblendvb:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
   %x = load <16 x i8>, ptr %px, align 16
   %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
   ret <16 x i8> %select
@@ -1065,11 +1137,17 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: load_fold_pblendvb_commute:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
-; AVX1OR2-NEXT:    retq
+; AVX1-LABEL: load_fold_pblendvb_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb_commute:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb_commute:
 ; AVX512VL:       # %bb.0:
@@ -1079,6 +1157,18 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) {
 ; AVX512VL-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: load_fold_pblendvb_commute:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX1-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: load_fold_pblendvb_commute:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
   %x = load <16 x i8>, ptr %px, align 16
   %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
   ret <16 x i8> %select
@@ -2103,7 +2193,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
 ;
 ; AVX2-LABEL: PR12412:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index e858c7cdbfa29..8cfeb2adba5d3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -6844,7 +6844,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
 ;
 ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
 ; AVX512VL-FAST-CROSSLANE:       # %bb.0:
-; AVX512VL-FAST-CROSSLANE-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-FAST-CROSSLANE-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VL-FAST-CROSSLANE-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 ; AVX512VL-FAST-CROSSLANE-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 7e7ba8b9ae65b..aebcb68f11c38 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2219,7 +2219,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2237,7 +2237,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
 ;
 ; XOPAVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
@@ -2255,7 +2255,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
 ;
 ; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2273,7 +2273,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
 ;
 ; XOPAVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
@@ -2293,7 +2293,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) {
 ;
 ; AVX2-LABEL: load_fold_pblendvb:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2312,7 +2312,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) {
 ;
 ; XOPAVX2-LABEL: load_fold_pblendvb:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
 ; XOPAVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %x = load <32 x i8>, ptr %px, align 32
@@ -2331,7 +2331,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
 ;
 ; AVX2-LABEL: load_fold_pblendvb_commute:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
 ; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2353,7 +2353,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
 ;
 ; XOPAVX2-LABEL: load_fold_pblendvb_commute:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
 ; XOPAVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %x = load <32 x i8>, ptr %px, align 32
@@ -4517,7 +4517,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
 ;
 ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
 ; AVX512VLVBMI-FAST-ALL:       # %bb.0:
-; AVX512VLVBMI-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI-FAST-ALL-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLVBMI-FAST-ALL-NEXT:    vpermb %ymm0, %ymm1, %ymm0
 ; AVX512VLVBMI-FAST-ALL-NEXT:    retq
 ;
@@ -4783,7 +4783,7 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 6200187a14a03..4668d7b6870ef 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -29,7 +29,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
 ;
 ; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SKX-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index d44e584599246..843b285ae1c36 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -163,7 +163,8 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
 ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
@@ -179,7 +180,8 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
@@ -463,7 +465,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -480,7 +483,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -505,7 +509,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
 ; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -528,7 +533,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX512DQ-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 90b5e70a0a302..8cc20ec3c1a7e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -455,7 +455,7 @@ define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
 ;
 ; AVX512F-LABEL: test_mm256_mask_blend_epi8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
@@ -473,7 +473,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
 ;
 ; AVX512F-LABEL: test_mm_mask_blend_epi8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    ret{{[l|q]}}
 entry:

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index b2d813dd440a6..a5ba81d516f72 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -56,7 +56,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X86-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
 ; X86-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
 ; X86-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
@@ -66,7 +67,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
 ; X64-LABEL: combine_pshufb_identity_mask:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X64-NEXT:    kmovq %rdi, %k1
 ; X64-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
 ; X64-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
@@ -157,14 +159,16 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64
 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <7,0,u,u,5,0,u,u,u,u,12,0,u,u,14,0>
+; X86-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0]
+; X86-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; X86-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
 ; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <7,u,5,u,u,12,u,14>
+; X64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,5,14,7,12,5,14]
+; X64-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; X64-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
 ; X64-NEXT:    kmovq %rdi, %k1
 ; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63]

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 1b9648e77162e..c6006a9b7493a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test1c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test1c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1835,13 +1843,21 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test4c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test4c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -3326,7 +3342,7 @@ define void @PR45604(ptr %dst, ptr %src) {
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index f7132b1ea7d23..7159edc2bbdf4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -16,7 +16,8 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
@@ -106,7 +107,8 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX512BW-NEXT:    vpor %xmm2, %xmm5, %xmm2
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
@@ -158,7 +160,8 @@ define <64 x i8> @f2(ptr %p0) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,1,5,7,11,13,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
@@ -306,7 +309,8 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
 ; AVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
@@ -340,7 +344,8 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512F-NEXT:    vpternlogq $216, %ymm5, %ymm2, %ymm0
@@ -390,7 +395,8 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512BW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
@@ -441,7 +447,8 @@ define <64 x i8> @f4(ptr %p0) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,0,4,6,10,12,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index 5f4572b8c3d88..ed9f849d35d00 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -38,7 +38,8 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm2
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 37b996bfe686a..6c57956b3e29c 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -43,7 +43,8 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
 ; AVX2-FAST-ALL:       # %bb.0:
 ; AVX2-FAST-ALL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -483,7 +484,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ;
 ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -821,7 +823,8 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
 ; AVX2-FAST-ALL:       # %bb.0:
 ; AVX2-FAST-ALL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -1231,7 +1234,8 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ;
 ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -2157,7 +2161,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ;
 ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
 ; AVX2-FAST-ALL:       # %bb.0:
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper

diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 4680e86cf73ad..6ecc3980329a5 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -107,16 +107,27 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_packus_v2i64_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_packus_v2i64_v2i32:
 ; AVX512F:       # %bb.0:
@@ -257,17 +268,29 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movq %xmm0, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_packus_v2i64_v2i32_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vmovq %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i32_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovq %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i32_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vmovq %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store:
 ; AVX512F:       # %bb.0:
@@ -514,7 +537,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
 ; AVX2-FAST-ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
 ; AVX2-FAST-ALL-NEXT:    vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-FAST-ALL-NEXT:    vzeroupper
@@ -1115,7 +1139,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
 ;
 ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -1127,7 +1151,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
 ;
 ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -1293,7 +1317,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ;
 ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -1306,7 +1330,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ;
 ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -2801,16 +2825,27 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_packus_v2i64_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_packus_v2i64_v2i8:
 ; AVX512F:       # %bb.0:
@@ -2955,17 +2990,29 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT:    pextrw $0, %xmm1, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_packus_v2i64_v2i8_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_packus_v2i64_v2i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_packus_v2i64_v2i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store:
 ; AVX512F:       # %bb.0:
@@ -3216,7 +3263,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3475,7 +3522,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]

diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index f6e4377f64fa7..c378281ac009e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -111,16 +111,27 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_ssat_v2i64_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32:
 ; AVX512F:       # %bb.0:
@@ -255,17 +266,29 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movq %xmm0, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_ssat_v2i64_v2i32_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vmovlpd %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store:
 ; AVX512F:       # %bb.0:
@@ -1126,10 +1149,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ;
 ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1138,10 +1161,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
 ;
 ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
@@ -1296,10 +1319,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ;
 ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1309,10 +1332,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
 ;
 ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [32767,32767]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [32767,32767]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -2539,16 +2562,27 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_ssat_v2i64_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [127,127]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8:
 ; AVX512F:       # %bb.0:
@@ -2686,17 +2720,29 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
 ; SSE41-NEXT:    pextrw $0, %xmm1, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_ssat_v2i64_v2i8_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127]
-; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127]
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [127,127]
+; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store:
 ; AVX512F:       # %bb.0:
@@ -2948,7 +2994,7 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -3208,7 +3254,7 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]

diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index f687374baea4b..a830a96cdcb22 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -67,16 +67,27 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_usat_v2i64_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT:    # xmm1 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT:    # xmm1 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_usat_v2i64_v2i32:
 ; AVX512F:       # %bb.0:
@@ -165,17 +176,29 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    movq %xmm0, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX-NEXT:    # xmm1 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vmovlpd %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i32_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i32_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX2-NEXT:    # xmm1 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store:
 ; AVX512F:       # %bb.0:
@@ -748,7 +771,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
 ; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-SLOW-NEXT:    # xmm1 = mem[0,0]
 ; AVX2-SLOW-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -760,7 +783,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
 ; AVX2-FAST-NEXT:    vmovddup {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-FAST-NEXT:    # xmm1 = mem[0,0]
 ; AVX2-FAST-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
@@ -872,7 +895,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
 ; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-SLOW-NEXT:    # xmm1 = mem[0,0]
 ; AVX2-SLOW-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-SLOW-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -885,7 +908,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
 ; AVX2-FAST-NEXT:    vmovddup {{.*#+}} xmm1 = [65535,65535]
 ; AVX2-FAST-NEXT:    # xmm1 = mem[0,0]
 ; AVX2-FAST-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
+; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-FAST-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -2005,16 +2028,27 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_usat_v2i64_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT:    # xmm1 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT:    # xmm1 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_usat_v2i64_v2i8:
 ; AVX512F:       # %bb.0:
@@ -2106,17 +2140,29 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
 ; SSE41-NEXT:    pextrw $0, %xmm2, (%rdi)
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX-NEXT:    # xmm1 = mem[0,0]
-; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc_usat_v2i64_v2i8_store:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
+; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc_usat_v2i64_v2i8_store:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX2-NEXT:    # xmm1 = mem[0,0]
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
+; AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store:
 ; AVX512F:       # %bb.0:
@@ -2280,7 +2326,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2452,7 +2498,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -4408,7 +4454,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
 ;
 ; AVX2-LABEL: trunc_usat_v32i16_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpminuw 32(%rdi), %ymm0, %ymm1
 ; AVX2-NEXT:    vpminuw (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
@@ -4417,7 +4463,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
 ;
 ; AVX512F-LABEL: trunc_usat_v32i16_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpminuw 32(%rdi), %ymm0, %ymm1
 ; AVX512F-NEXT:    vpminuw (%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -4429,7 +4475,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) {
 ;
 ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpminuw 32(%rdi), %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpminuw (%rdi), %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero

diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index b5fa7312f7121..8c85c82b52ec6 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -582,7 +582,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ; AVX2-LABEL: trunc8i32_8i8:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1815,17 +1815,25 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: trunc2x8i16_16i8:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: trunc2x8i16_16i8:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc2x8i16_16i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
@@ -1833,7 +1841,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; AVX512VL-LABEL: trunc2x8i16_16i8:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index caeb0015d4b52..0a19c166f936b 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -122,7 +122,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -150,7 +150,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [64,64]
 ; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
@@ -323,7 +323,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -351,7 +351,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [64,64]
 ; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
@@ -548,7 +548,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -789,7 +789,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
@@ -992,23 +992,77 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv8i16:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv8i16:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
@@ -1168,23 +1222,77 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv8i16u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv8i16u:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv8i16u:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv8i16u:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv8i16u:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
@@ -1330,20 +1438,65 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv16i8:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv16i8:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
@@ -1485,20 +1638,65 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv16i8u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv16i8u:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv16i8u:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv16i8u:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv16i8u:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:

diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
index f97223b79cb0c..f2c84713d47ef 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -45,9 +45,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -119,9 +120,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -168,9 +170,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -242,9 +245,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -299,9 +303,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -385,9 +390,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -446,9 +452,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -532,9 +539,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -588,9 +596,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -606,9 +615,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -624,9 +634,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -679,9 +690,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -732,9 +744,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -750,9 +763,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -768,9 +782,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -823,9 +838,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -870,9 +886,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -885,9 +902,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -900,9 +918,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -915,9 +934,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -930,9 +950,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -962,9 +983,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1006,9 +1028,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1021,9 +1044,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1036,9 +1060,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1051,9 +1076,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1066,9 +1092,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
@@ -1098,9 +1125,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
 ; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0

diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
index cb64483731433..368fcd3e0e9a1 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -31,9 +31,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -90,9 +91,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -149,9 +151,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -216,9 +219,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -263,9 +267,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm2, %ymm2
@@ -294,9 +299,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -312,9 +318,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -359,9 +366,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm2, %ymm4
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm2, %ymm2
@@ -390,9 +398,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -408,9 +417,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -456,9 +466,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm3
 ; AVX512CD-NEXT:    vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
@@ -480,9 +491,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -495,9 +507,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -511,9 +524,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm3
 ; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1
@@ -548,9 +562,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm3
 ; AVX512CD-NEXT:    vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
@@ -572,9 +587,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
 ; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -587,9 +603,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
@@ -603,9 +620,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm3
 ; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm4
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1

diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
index 24f40b6fdf1be..650ee0e7e3f1d 100644
--- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -353,15 +353,25 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: ugt_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: ugt_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ugt_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %cmp = icmp ugt <16 x i8> %sh1, %sh2
@@ -380,15 +390,25 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: ult_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: ult_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ult_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
   %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %cmp = icmp ult <16 x i8> %sh1, %sh2
@@ -407,16 +427,27 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: uge_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: uge_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uge_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %cmp = icmp uge <16 x i8> %sh1, %sh2
@@ -435,16 +466,27 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: ule_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: ule_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ule_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %cmp = icmp ule <16 x i8> %sh1, %sh2

diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index 1781196fc6f64..d180bbe9d53e0 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -447,14 +447,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
 ; X64-SSE2:       # %bb.0:
@@ -465,14 +474,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) {
 ; X64-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
   %t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
   %t1 = ashr <16 x i8> %t0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   ret <16 x i8> %t1
@@ -487,14 +505,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
 ; X64-SSE2:       # %bb.0:
@@ -505,14 +532,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) {
 ; X64-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
   %t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
   %t1 = ashr <16 x i8> %t0, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   ret <16 x i8> %t1
@@ -527,14 +563,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpsrlw $5, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vpsrlw $5, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlw $5, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
 ; X64-SSE2:       # %bb.0:
@@ -545,14 +590,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) {
 ; X64-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpsrlw $5, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpsrlw $5, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
   %t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
   %t1 = ashr <16 x i8> %t0, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   ret <16 x i8> %t1
@@ -567,14 +621,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) {
 ; X86-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpsrlw $6, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlw $6, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
 ; X64-SSE2:       # %bb.0:
@@ -585,14 +648,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) {
 ; X64-SSE2-NEXT:    psubb %xmm1, %xmm0
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpsrlw $6, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlw $6, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
   %t0 = and <16 x i8> %a0, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
   %t1 = ashr <16 x i8> %t0, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
   ret <16 x i8> %t1

diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 367e0993e76ba..6d42147bd7c45 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -182,7 +182,7 @@ define <32 x i8> @PR22706(<32 x i1> %x) {
 ; AVX512-LABEL: PR22706:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX512-NEXT:    vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>

diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll
index 7a3e9af985497..3a813ccd41fdc 100644
--- a/llvm/test/CodeGen/X86/vselect-minmax.ll
+++ b/llvm/test/CodeGen/X86/vselect-minmax.ll
@@ -9521,7 +9521,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test181:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9589,7 +9589,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test182:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9657,7 +9657,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test183:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9725,7 +9725,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test184:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10027,7 +10027,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test189:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10095,7 +10095,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test190:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10163,7 +10163,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test191:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10231,7 +10231,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
 ;
 ; AVX2-LABEL: test192:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2

diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index a0573a449646d..7a9b66d026a44 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -681,7 +681,7 @@ define <2 x i64> @blend_splatmax_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2
 ; AVX512F-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; AVX512F-NEXT:    vptestnmq %zmm3, %zmm0, %k1
 ; AVX512F-NEXT:    vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0

diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll
index fdbc361e85d22..e91b8d029bcb4 100644
--- a/llvm/test/CodeGen/X86/vselect-post-combine.ll
+++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll
@@ -5,7 +5,7 @@ define ptr @test_mul(ptr %addr) {
 ; AVX2-LABEL: test_mul:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpblendvb %xmm0, (%rdi), %xmm1, %xmm0
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vmovdqu %ymm0, 0

diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index be720f59d978e..2fd2afdee8c11 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -483,39 +483,39 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
 ; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX2-NEXT:    vpcmpeqb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -648,16 +648,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm4
 ; AVX2-NEXT:    vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm8
 ; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
 ; AVX2-NEXT:    vpshufb %ymm8, %ymm1, %ymm9
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
 ; AVX2-NEXT:    vpermd %ymm9, %ymm6, %ymm9
@@ -665,16 +665,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX2-NEXT:    vpermd %ymm8, %ymm6, %ymm8
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm2, %xmm9
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
 ; AVX2-NEXT:    vpshufb %ymm9, %ymm1, %ymm10
 ; AVX2-NEXT:    vpermd %ymm10, %ymm6, %ymm10
 ; AVX2-NEXT:    vpshufb %ymm9, %ymm0, %ymm9
@@ -682,32 +682,32 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
 ; AVX2-NEXT:    vpshufb %xmm8, %xmm4, %xmm8
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm3, %xmm10
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm2, %xmm9
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
 ; AVX2-NEXT:    vpshufb %ymm9, %ymm1, %ymm10
 ; AVX2-NEXT:    vpermd %ymm10, %ymm6, %ymm10
 ; AVX2-NEXT:    vpshufb %ymm9, %ymm0, %ymm9
 ; AVX2-NEXT:    vpermd %ymm9, %ymm6, %ymm9
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
 ; AVX2-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermd %ymm1, %ymm6, %ymm1
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
@@ -830,7 +830,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
 ; AVX2OR512-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
 ; AVX2OR512-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX2OR512-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
-; AVX2OR512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2OR512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2OR512-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2OR512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
@@ -838,7 +839,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
 ; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
-; AVX2OR512-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2OR512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2OR512-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2OR512-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
 ; AVX2OR512-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2OR512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
@@ -1025,7 +1027,8 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
@@ -1206,7 +1209,8 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm6
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
@@ -1399,7 +1403,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
 ; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm7
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
+; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm8, %ymm7, %ymm7
 ; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm5
 ; AVX2-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
@@ -1408,7 +1413,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm3, %ymm3
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm2, %ymm0, %ymm9
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
+; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb %ymm10, %ymm9, %ymm9
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm4, %ymm1, %ymm8
 ; AVX2-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
@@ -1425,7 +1431,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX2-NEXT:    vpor %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpaddb %ymm3, %ymm8, %ymm3
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u,2,5,8,11,14,u,u,u,u,u>
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
 ; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
 ; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
@@ -1454,7 +1460,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX512-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm3
 ; AVX512-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512-NEXT:    vpshufb %zmm3, %zmm2, %zmm2

diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index 86737f28e28cc..8d6c1483d817f 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -3092,7 +3092,8 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5>
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermw %ymm1, %ymm2, %ymm1
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero

diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 85bec77fe5eb2..c93d4a60898e5 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1221,7 +1221,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1346,7 +1346,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1466,7 +1466,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -2429,7 +2430,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -2579,7 +2580,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
@@ -2723,7 +2725,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -2874,7 +2876,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
@@ -3018,7 +3021,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -3168,7 +3171,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
@@ -3310,7 +3314,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -3455,10 +3460,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
@@ -5227,7 +5234,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
@@ -5244,7 +5252,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
@@ -5406,7 +5415,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vpermq %zmm0, %zmm1, %zmm1
 ; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -5508,7 +5518,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
@@ -5525,7 +5536,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovaps 32(%rdx), %ymm1

diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 2e00b5c9c91a5..0ea821b11bc96 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1012,7 +1012,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1110,7 +1110,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1202,7 +1202,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1909,7 +1910,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2039,7 +2040,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -2161,7 +2163,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2291,7 +2293,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -2413,7 +2416,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2543,7 +2546,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -2664,7 +2668,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm0
@@ -2788,10 +2793,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255>
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovaps 32(%rsi), %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -4194,7 +4201,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ;
 ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpermt2d (%rdi), %zmm0, %zmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
@@ -4206,7 +4214,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ;
 ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u>
+; AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
+; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpermt2d (%rdi), %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
@@ -4413,7 +4422,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rsi), %ymm1
@@ -4426,7 +4436,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u>
+; AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
+; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovaps 32(%rsi), %ymm1