[llvm] r291665 - [X86][AVX512BW] Vectorize v64i8 vector shifts

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 11 02:36:51 PST 2017


Author: rksimon
Date: Wed Jan 11 04:36:51 2017
New Revision: 291665

URL: http://llvm.org/viewvc/llvm-project?rev=291665&view=rev
Log:
[X86][AVX512BW] Vectorize v64i8 vector shifts

Differential Revision: https://reviews.llvm.org/D28447

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
    llvm/trunk/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
    llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-lshr-512.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-shl-512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jan 11 04:36:51 2017
@@ -21752,14 +21752,26 @@ static SDValue LowerShift(SDValue Op, co
   }
 
   if (VT == MVT::v16i8 ||
-      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
     unsigned ShiftOpcode = Op->getOpcode();
 
     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
-      // On SSE41 targets we make use of the fact that VSELECT lowers
-      // to PBLENDVB which selects bytes based just on the sign bit.
-      if (Subtarget.hasSSE41()) {
+      if (VT.is512BitVector()) {
+        // On AVX512BW targets we make use of the fact that VSELECT lowers
+        // to a masked blend which selects bytes based just on the sign bit
+        // extracted to a mask.
+        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+        V0 = DAG.getBitcast(VT, V0);
+        V1 = DAG.getBitcast(VT, V1);
+        Sel = DAG.getBitcast(VT, Sel);
+        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+        return DAG.getBitcast(SelVT,
+                              DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+      } else if (Subtarget.hasSSE41()) {
+        // On SSE41 targets we make use of the fact that VSELECT lowers
+        // to PBLENDVB which selects bytes based just on the sign bit.
         V0 = DAG.getBitcast(VT, V0);
         V1 = DAG.getBitcast(VT, V1);
         Sel = DAG.getBitcast(VT, Sel);

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Jan 11 04:36:51 2017
@@ -323,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRL,   MVT::v32i16,     1 }, // vpsrlvw
     { ISD::SRA,   MVT::v32i16,     1 }, // vpsravw
 
+    { ISD::SHL,   MVT::v64i8,     11 }, // vpmovb2m + masked blend sequence.
+    { ISD::SRL,   MVT::v64i8,     11 }, // vpmovb2m + masked blend sequence.
+    { ISD::SRA,   MVT::v64i8,     24 }, // vpmovb2m + masked blend sequence.
+
     { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
     { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.

Modified: llvm/trunk/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/vshift-ashr-cost.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/vshift-ashr-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/vshift-ashr-cost.ll Wed Jan 11 04:36:51 2017
@@ -165,9 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ; AVX: Found an estimated cost of 96 for instruction:   %shift
 ; AVX2: Found an estimated cost of 48 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 24 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -333,9 +333,9 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX: Found an estimated cost of 96 for instruction:   %shift
 ; AVX2: Found an estimated cost of 48 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 24 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = ashr <64 x i8> %a, %splat
@@ -491,9 +491,9 @@ define <64 x i8> @constant_shift_v64i8(<
 ; AVX: Found an estimated cost of 96 for instruction:   %shift
 ; AVX2: Found an estimated cost of 48 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 24 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 48 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift

Modified: llvm/trunk/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/vshift-lshr-cost.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/vshift-lshr-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/vshift-lshr-cost.ll Wed Jan 11 04:36:51 2017
@@ -165,9 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ; AVX: Found an estimated cost of 48 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -336,9 +336,9 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX: Found an estimated cost of 48 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = lshr <64 x i8> %a, %splat
@@ -497,9 +497,9 @@ define <64 x i8> @constant_shift_v64i8(<
 ; AVX: Found an estimated cost of 48 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 8 for instruction:   %shift
   %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift

Modified: llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/vshift-shl-cost.ll Wed Jan 11 04:36:51 2017
@@ -164,9 +164,9 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ; AVX: Found an estimated cost of 44 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 4 for instruction:   %shift
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -335,9 +335,9 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX: Found an estimated cost of 44 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 4 for instruction:   %shift
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = shl <64 x i8> %a, %splat
@@ -498,9 +498,9 @@ define <64 x i8> @constant_shift_v64i8(<
 ; AVX: Found an estimated cost of 44 for instruction:   %shift
 ; AVX2: Found an estimated cost of 22 for instruction:   %shift
 ; AVX512F: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BW: Found an estimated cost of 11 for instruction:   %shift
 ; AVX512VL: Found an estimated cost of 22 for instruction:   %shift
-; AVX512BWVL: Found an estimated cost of 2 for instruction:   %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction:   %shift
 ; XOP: Found an estimated cost of 4 for instruction:   %shift
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-512.ll Wed Jan 11 04:36:51 2017
@@ -100,399 +100,36 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -591,399 +228,36 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX512BW-LABEL: splatvar_shift_v64i8:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    sarb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = ashr <64 x i8> %a, %splat
@@ -1081,252 +355,36 @@ define <64 x i8> @constant_shift_v64i8(<
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT:    sarb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT:    sarb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT:    vpsraw $4, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, {{.*}}(%rip), %zmm3
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm3[0],zmm0[1],zmm3[1],zmm0[2],zmm3[2],zmm0[3],zmm3[3],zmm0[4],zmm3[4],zmm0[5],zmm3[5],zmm0[6],zmm3[6],zmm0[7],zmm3[7],zmm0[16],zmm3[16],zmm0[17],zmm3[17],zmm0[18],zmm3[18],zmm0[19],zmm3[19],zmm0[20],zmm3[20],zmm0[21],zmm3[21],zmm0[22],zmm3[22],zmm0[23],zmm3[23],zmm0[32],zmm3[32],zmm0[33],zmm3[33],zmm0[34],zmm3[34],zmm0[35],zmm3[35],zmm0[36],zmm3[36],zmm0[37],zmm3[37],zmm0[38],zmm3[38],zmm0[39],zmm3[39],zmm0[48],zmm3[48],zmm0[49],zmm3[49],zmm0[50],zmm3[50],zmm0[51],zmm3[51],zmm0[52],zmm3[52],zmm0[53],zmm3[53],zmm0[54],zmm3[54],zmm0[55],zmm3[55]
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-lshr-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-lshr-512.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-lshr-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-lshr-512.ll Wed Jan 11 04:36:51 2017
@@ -79,399 +79,21 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -553,399 +175,21 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX512BW-LABEL: splatvar_shift_v64i8:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shrb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = lshr <64 x i8> %a, %splat
@@ -1026,252 +270,21 @@ define <64 x i8> @constant_shift_v64i8(<
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT:    shrb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT:    shrb %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift

Modified: llvm/trunk/test/CodeGen/X86/vector-shift-shl-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-shl-512.ll?rev=291665&r1=291664&r2=291665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-shl-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-shl-512.ll Wed Jan 11 04:36:51 2017
@@ -76,399 +76,19 @@ define <64 x i8> @var_shift_v64i8(<64 x
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -547,399 +167,19 @@ define <64 x i8> @splatvar_shift_v64i8(<
 ; AVX512BW-LABEL: splatvar_shift_v64i8:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %sil
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm5
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT:    vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT:    vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %sil
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    movzbl %sil, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm4
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %dl
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT:    shlb %cl, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = shl <64 x i8> %a, %splat
@@ -1013,252 +253,19 @@ define <64 x i8> @constant_shift_v64i8(<
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm3
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $7, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $6, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $4, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $3, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT:    shlb $2, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT:    addb %al, %al
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift




More information about the llvm-commits mailing list