[llvm] 88dacbd - [X86] Go back to considering v64i1 as a legal type under min-legal-vector-width=256. Scalarize v64i1 arguments and shuffles under min-legal-vector-width=256.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 10 15:08:16 PST 2019


Author: Craig Topper
Date: 2019-12-10T15:07:55-08:00
New Revision: 88dacbd43625cf7aad8a01c0c3b92142c4dc0970

URL: https://github.com/llvm/llvm-project/commit/88dacbd43625cf7aad8a01c0c3b92142c4dc0970
DIFF: https://github.com/llvm/llvm-project/commit/88dacbd43625cf7aad8a01c0c3b92142c4dc0970.diff

LOG: [X86] Go back to considering v64i1 as a legal type under min-legal-vector-width=256. Scalarize v64i1 arguments and shuffles under min-legal-vector-width=256.

This reverts 3e1aee2ba717529b651a79ed4fc7e7147358043f in favor
of a different approach.

Scalarizing isn't great codegen, but making the type illegal was
interfering with the k constraint in inline assembly.
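
For context, the breakage was in inline assembly that uses the "k"
constraint to place a 64-bit value in an AVX-512 mask register. A
minimal C++ sketch of that pattern (my illustration, not code from this
commit; it mirrors the IR test added below and needs an AVX-512BW
target, e.g. -march=skylake-avx512):

    #include <cstdint>

    // Round-trip an i64 through a mask register via the "k" constraint.
    // With v64i1 made illegal under min-legal-vector-width=256, the
    // backend could not satisfy this constraint; keeping the type legal
    // restores it.
    uint64_t mask_roundtrip(uint64_t m) {
      uint64_t out;
      __asm__("" : "=k"(out) : "k"(m)); // empty asm body; only the
                                        // register constraints matter
      return out;
    }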

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/min-legal-vector-width.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7733ad661625..703a3af19184 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1638,32 +1638,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
   }
 
-  // This block control legalization of v32i1 which is available with
+  // This block controls legalization of v32i1/v64i1 which are available with
   // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
-  // useBWIRegs. v64i1 is also controled with useBWIRegs.
+  // useBWIRegs.
   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
 
-    setOperationAction(ISD::ADD,                MVT::v32i1, Custom);
-    setOperationAction(ISD::SUB,                MVT::v32i1, Custom);
-    setOperationAction(ISD::MUL,                MVT::v32i1, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
-    setOperationAction(ISD::UADDSAT,            MVT::v32i1, Custom);
-    setOperationAction(ISD::SADDSAT,            MVT::v32i1, Custom);
-    setOperationAction(ISD::USUBSAT,            MVT::v32i1, Custom);
-    setOperationAction(ISD::SSUBSAT,            MVT::v32i1, Custom);
-
-    setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
-    setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
+    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
+      setOperationAction(ISD::ADD,                VT, Custom);
+      setOperationAction(ISD::SUB,                VT, Custom);
+      setOperationAction(ISD::MUL,                VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Expand);
+      setOperationAction(ISD::UADDSAT,            VT, Custom);
+      setOperationAction(ISD::SADDSAT,            VT, Custom);
+      setOperationAction(ISD::USUBSAT,            VT, Custom);
+      setOperationAction(ISD::SSUBSAT,            VT, Custom);
+
+      setOperationAction(ISD::TRUNCATE,           VT, Custom);
+      setOperationAction(ISD::SETCC,              VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+    }
 
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
-    setOperationAction(ISD::EXTRACT_SUBVECTOR,  MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
+    for (auto VT : { MVT::v16i1, MVT::v32i1 })
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
     // Extends from v32i1 masks to 256-bit vectors.
     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
@@ -1753,34 +1759,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
       setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
     }
-
-    // Only support v64i1 if we support v64i8. Without 64i8 we won't have any
-    // operations that can produce these values other than concatenating
-    // v32i1 vectors together. And we don't have any masked operations that
-    // need a v64i1. By making it legal we avoid needing to lower arbitrary
-    // shuffles of v64i1 which need v64i8 to be legal.
-    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
-
-    setOperationAction(ISD::ADD,                MVT::v64i1, Custom);
-    setOperationAction(ISD::SUB,                MVT::v64i1, Custom);
-    setOperationAction(ISD::MUL,                MVT::v64i1, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
-    setOperationAction(ISD::UADDSAT,            MVT::v64i1, Custom);
-    setOperationAction(ISD::SADDSAT,            MVT::v64i1, Custom);
-    setOperationAction(ISD::USUBSAT,            MVT::v64i1, Custom);
-    setOperationAction(ISD::SSUBSAT,            MVT::v64i1, Custom);
-
-    setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
-    setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
-
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
-    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
-    setOperationAction(ISD::EXTRACT_SUBVECTOR,  MVT::v32i1, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
@@ -2020,6 +1998,7 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
       Subtarget.hasAVX512() &&
       (!isPowerOf2_32(VT.getVectorNumElements()) ||
        (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+       (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) ||
        (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
     return MVT::i8;
   // FIXME: Should we just make these types legal and custom split operations?
@@ -2040,6 +2019,7 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
       Subtarget.hasAVX512() &&
       (!isPowerOf2_32(VT.getVectorNumElements()) ||
        (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+       (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) ||
        (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
     return VT.getVectorNumElements();
   // FIXME: Should we just make these types legal and custom split operations?
@@ -2057,6 +2037,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
       Subtarget.hasAVX512() &&
       (!isPowerOf2_32(VT.getVectorNumElements()) ||
        (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+       (VT.getVectorNumElements() > 32 && !Subtarget.useBWIRegs()) ||
        (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
     RegisterVT = MVT::i8;
     IntermediateVT = MVT::i1;
@@ -17041,6 +17022,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
     break;
   case MVT::v64i1:
+    // Fall back to scalarization. FIXME: We can do better if the shuffle
+    // can be partitioned cleanly.
+    if (!Subtarget.useBWIRegs())
+      return SDValue();
     ExtVT = MVT::v64i8;
     break;
   }

diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 6e256c060d29..bf48a305a2ba 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
 ; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
@@ -1120,6 +1120,448 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min
 define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: v64i1_argument_return:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    kmovd %esi, %k0
+; CHECK-NEXT:    kshiftlq $63, %k0, %k0
+; CHECK-NEXT:    kshiftrq $63, %k0, %k0
+; CHECK-NEXT:    kshiftlq $2, %k0, %k1
+; CHECK-NEXT:    kmovd %edx, %k2
+; CHECK-NEXT:    kshiftlq $1, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $62, %k0, %k0
+; CHECK-NEXT:    kshiftrq $62, %k0, %k0
+; CHECK-NEXT:    kshiftlq $3, %k0, %k1
+; CHECK-NEXT:    kmovd %ecx, %k2
+; CHECK-NEXT:    kshiftlq $2, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $61, %k0, %k0
+; CHECK-NEXT:    kshiftrq $61, %k0, %k0
+; CHECK-NEXT:    kshiftlq $4, %k0, %k1
+; CHECK-NEXT:    kmovd %r8d, %k2
+; CHECK-NEXT:    kshiftlq $3, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $60, %k0, %k0
+; CHECK-NEXT:    kshiftrq $60, %k0, %k0
+; CHECK-NEXT:    kshiftlq $5, %k0, %k1
+; CHECK-NEXT:    kmovd %r9d, %k2
+; CHECK-NEXT:    kshiftlq $4, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $59, %k0, %k0
+; CHECK-NEXT:    kshiftrq $59, %k0, %k0
+; CHECK-NEXT:    kshiftlq $6, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $5, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $58, %k0, %k0
+; CHECK-NEXT:    kshiftrq $58, %k0, %k0
+; CHECK-NEXT:    kshiftlq $7, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $6, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $57, %k0, %k0
+; CHECK-NEXT:    kshiftrq $57, %k0, %k0
+; CHECK-NEXT:    kshiftlq $8, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $7, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $56, %k0, %k0
+; CHECK-NEXT:    kshiftrq $56, %k0, %k0
+; CHECK-NEXT:    kshiftlq $9, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $8, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $55, %k0, %k0
+; CHECK-NEXT:    kshiftrq $55, %k0, %k0
+; CHECK-NEXT:    kshiftlq $10, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $9, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $54, %k0, %k0
+; CHECK-NEXT:    kshiftrq $54, %k0, %k0
+; CHECK-NEXT:    kshiftlq $11, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $10, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $53, %k0, %k0
+; CHECK-NEXT:    kshiftrq $53, %k0, %k0
+; CHECK-NEXT:    kshiftlq $12, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $11, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $52, %k0, %k0
+; CHECK-NEXT:    kshiftrq $52, %k0, %k0
+; CHECK-NEXT:    kshiftlq $13, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $12, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $51, %k0, %k0
+; CHECK-NEXT:    kshiftrq $51, %k0, %k0
+; CHECK-NEXT:    kshiftlq $14, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $13, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $50, %k0, %k0
+; CHECK-NEXT:    kshiftrq $50, %k0, %k0
+; CHECK-NEXT:    kshiftlq $15, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $14, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $49, %k0, %k0
+; CHECK-NEXT:    kshiftrq $49, %k0, %k0
+; CHECK-NEXT:    kshiftlq $16, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $15, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $48, %k0, %k0
+; CHECK-NEXT:    kshiftrq $48, %k0, %k0
+; CHECK-NEXT:    kshiftlq $17, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $16, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $47, %k0, %k0
+; CHECK-NEXT:    kshiftrq $47, %k0, %k0
+; CHECK-NEXT:    kshiftlq $18, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $17, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $46, %k0, %k0
+; CHECK-NEXT:    kshiftrq $46, %k0, %k0
+; CHECK-NEXT:    kshiftlq $19, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $18, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $45, %k0, %k0
+; CHECK-NEXT:    kshiftrq $45, %k0, %k0
+; CHECK-NEXT:    kshiftlq $20, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $19, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $44, %k0, %k0
+; CHECK-NEXT:    kshiftrq $44, %k0, %k0
+; CHECK-NEXT:    kshiftlq $21, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $20, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $43, %k0, %k0
+; CHECK-NEXT:    kshiftrq $43, %k0, %k0
+; CHECK-NEXT:    kshiftlq $22, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $21, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $42, %k0, %k0
+; CHECK-NEXT:    kshiftrq $42, %k0, %k0
+; CHECK-NEXT:    kshiftlq $23, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $22, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $41, %k0, %k0
+; CHECK-NEXT:    kshiftrq $41, %k0, %k0
+; CHECK-NEXT:    kshiftlq $24, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $23, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $40, %k0, %k0
+; CHECK-NEXT:    kshiftrq $40, %k0, %k0
+; CHECK-NEXT:    kshiftlq $25, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $24, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $39, %k0, %k0
+; CHECK-NEXT:    kshiftrq $39, %k0, %k0
+; CHECK-NEXT:    kshiftlq $26, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $25, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $38, %k0, %k0
+; CHECK-NEXT:    kshiftrq $38, %k0, %k0
+; CHECK-NEXT:    kshiftlq $27, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $26, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $37, %k0, %k0
+; CHECK-NEXT:    kshiftrq $37, %k0, %k0
+; CHECK-NEXT:    kshiftlq $28, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $27, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $36, %k0, %k0
+; CHECK-NEXT:    kshiftrq $36, %k0, %k0
+; CHECK-NEXT:    kshiftlq $29, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $28, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $35, %k0, %k0
+; CHECK-NEXT:    kshiftrq $35, %k0, %k0
+; CHECK-NEXT:    kshiftlq $30, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $29, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $34, %k0, %k0
+; CHECK-NEXT:    kshiftrq $34, %k0, %k0
+; CHECK-NEXT:    kshiftlq $31, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $30, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $33, %k0, %k0
+; CHECK-NEXT:    kshiftrq $33, %k0, %k0
+; CHECK-NEXT:    kshiftlq $32, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $31, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $32, %k0, %k0
+; CHECK-NEXT:    kshiftrq $32, %k0, %k0
+; CHECK-NEXT:    kshiftlq $33, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $32, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $31, %k0, %k0
+; CHECK-NEXT:    kshiftrq $31, %k0, %k0
+; CHECK-NEXT:    kshiftlq $34, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $33, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $30, %k0, %k0
+; CHECK-NEXT:    kshiftrq $30, %k0, %k0
+; CHECK-NEXT:    kshiftlq $35, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $34, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $29, %k0, %k0
+; CHECK-NEXT:    kshiftrq $29, %k0, %k0
+; CHECK-NEXT:    kshiftlq $36, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $35, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $28, %k0, %k0
+; CHECK-NEXT:    kshiftrq $28, %k0, %k0
+; CHECK-NEXT:    kshiftlq $37, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $36, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $27, %k0, %k0
+; CHECK-NEXT:    kshiftrq $27, %k0, %k0
+; CHECK-NEXT:    kshiftlq $38, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $37, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $26, %k0, %k0
+; CHECK-NEXT:    kshiftrq $26, %k0, %k0
+; CHECK-NEXT:    kshiftlq $39, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $38, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $25, %k0, %k0
+; CHECK-NEXT:    kshiftrq $25, %k0, %k0
+; CHECK-NEXT:    kshiftlq $40, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $39, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $24, %k0, %k0
+; CHECK-NEXT:    kshiftrq $24, %k0, %k0
+; CHECK-NEXT:    kshiftlq $41, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $40, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $23, %k0, %k0
+; CHECK-NEXT:    kshiftrq $23, %k0, %k0
+; CHECK-NEXT:    kshiftlq $42, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $41, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $22, %k0, %k0
+; CHECK-NEXT:    kshiftrq $22, %k0, %k0
+; CHECK-NEXT:    kshiftlq $43, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $42, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $21, %k0, %k0
+; CHECK-NEXT:    kshiftrq $21, %k0, %k0
+; CHECK-NEXT:    kshiftlq $44, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $43, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $20, %k0, %k0
+; CHECK-NEXT:    kshiftrq $20, %k0, %k0
+; CHECK-NEXT:    kshiftlq $45, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $44, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $19, %k0, %k0
+; CHECK-NEXT:    kshiftrq $19, %k0, %k0
+; CHECK-NEXT:    kshiftlq $46, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $45, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $18, %k0, %k0
+; CHECK-NEXT:    kshiftrq $18, %k0, %k0
+; CHECK-NEXT:    kshiftlq $47, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $46, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $17, %k0, %k0
+; CHECK-NEXT:    kshiftrq $17, %k0, %k0
+; CHECK-NEXT:    kshiftlq $48, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $47, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $16, %k0, %k0
+; CHECK-NEXT:    kshiftrq $16, %k0, %k0
+; CHECK-NEXT:    kshiftlq $49, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $48, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $15, %k0, %k0
+; CHECK-NEXT:    kshiftrq $15, %k0, %k0
+; CHECK-NEXT:    kshiftlq $50, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $49, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $14, %k0, %k0
+; CHECK-NEXT:    kshiftrq $14, %k0, %k0
+; CHECK-NEXT:    kshiftlq $51, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $50, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $13, %k0, %k0
+; CHECK-NEXT:    kshiftrq $13, %k0, %k0
+; CHECK-NEXT:    kshiftlq $52, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $51, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $12, %k0, %k0
+; CHECK-NEXT:    kshiftrq $12, %k0, %k0
+; CHECK-NEXT:    kshiftlq $53, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $52, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $11, %k0, %k0
+; CHECK-NEXT:    kshiftrq $11, %k0, %k0
+; CHECK-NEXT:    kshiftlq $54, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $53, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $10, %k0, %k0
+; CHECK-NEXT:    kshiftrq $10, %k0, %k0
+; CHECK-NEXT:    kshiftlq $55, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $54, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $9, %k0, %k0
+; CHECK-NEXT:    kshiftrq $9, %k0, %k0
+; CHECK-NEXT:    kshiftlq $56, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $55, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $8, %k0, %k0
+; CHECK-NEXT:    kshiftrq $8, %k0, %k0
+; CHECK-NEXT:    kshiftlq $57, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $56, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $7, %k0, %k0
+; CHECK-NEXT:    kshiftrq $7, %k0, %k0
+; CHECK-NEXT:    kshiftlq $58, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $57, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $6, %k0, %k0
+; CHECK-NEXT:    kshiftrq $6, %k0, %k0
+; CHECK-NEXT:    kshiftlq $59, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $58, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $5, %k0, %k0
+; CHECK-NEXT:    kshiftrq $5, %k0, %k0
+; CHECK-NEXT:    kshiftlq $60, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $59, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $4, %k0, %k0
+; CHECK-NEXT:    kshiftrq $4, %k0, %k0
+; CHECK-NEXT:    kshiftlq $61, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $60, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $3, %k0, %k0
+; CHECK-NEXT:    kshiftrq $3, %k0, %k0
+; CHECK-NEXT:    kshiftlq $62, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $61, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $2, %k0, %k0
+; CHECK-NEXT:    kshiftrq $2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k0, %k1
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k2
+; CHECK-NEXT:    kshiftlq $62, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kmovb {{[0-9]+}}(%rsp), %k1
+; CHECK-NEXT:    kshiftlq $1, %k0, %k0
+; CHECK-NEXT:    kshiftrq $1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k0
+; CHECK-NEXT:    kmovq %k0, (%rdi)
 ; CHECK-NEXT:    retq
   ret <64 x i1> %x
 }
@@ -1127,19 +1569,451 @@ define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="
 define void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: v64i1_shuffle:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm1, %k0
+; CHECK-NEXT:    kshiftrd $3, %k0, %k1
+; CHECK-NEXT:    kshiftlq $2, %k0, %k2
+; CHECK-NEXT:    kshiftlq $1, %k0, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $1, %k0, %k3
+; CHECK-NEXT:    kshiftlq $63, %k3, %k3
+; CHECK-NEXT:    kshiftrq $63, %k3, %k3
+; CHECK-NEXT:    korq %k2, %k3, %k2
+; CHECK-NEXT:    kshiftlq $3, %k0, %k3
+; CHECK-NEXT:    kshiftlq $2, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k3, %k1
+; CHECK-NEXT:    kshiftrd $2, %k0, %k3
+; CHECK-NEXT:    kshiftlq $62, %k2, %k2
+; CHECK-NEXT:    kshiftrq $62, %k2, %k2
+; CHECK-NEXT:    korq %k1, %k2, %k1
+; CHECK-NEXT:    kshiftlq $4, %k0, %k2
+; CHECK-NEXT:    kshiftlq $3, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $5, %k0, %k3
+; CHECK-NEXT:    kshiftlq $61, %k1, %k1
+; CHECK-NEXT:    kshiftrq $61, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $5, %k0, %k2
+; CHECK-NEXT:    kshiftlq $4, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $4, %k0, %k3
+; CHECK-NEXT:    kshiftlq $60, %k1, %k1
+; CHECK-NEXT:    kshiftrq $60, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $6, %k0, %k2
+; CHECK-NEXT:    kshiftlq $5, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $7, %k0, %k3
+; CHECK-NEXT:    kshiftlq $59, %k1, %k1
+; CHECK-NEXT:    kshiftrq $59, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $7, %k0, %k2
+; CHECK-NEXT:    kshiftlq $6, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $6, %k0, %k3
+; CHECK-NEXT:    kshiftlq $58, %k1, %k1
+; CHECK-NEXT:    kshiftrq $58, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $8, %k0, %k2
+; CHECK-NEXT:    kshiftlq $7, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $9, %k0, %k3
+; CHECK-NEXT:    kshiftlq $57, %k1, %k1
+; CHECK-NEXT:    kshiftrq $57, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $9, %k0, %k2
+; CHECK-NEXT:    kshiftlq $8, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $8, %k0, %k3
+; CHECK-NEXT:    kshiftlq $56, %k1, %k1
+; CHECK-NEXT:    kshiftrq $56, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $10, %k0, %k2
+; CHECK-NEXT:    kshiftlq $9, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $11, %k0, %k3
+; CHECK-NEXT:    kshiftlq $55, %k1, %k1
+; CHECK-NEXT:    kshiftrq $55, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $11, %k0, %k2
+; CHECK-NEXT:    kshiftlq $10, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $10, %k0, %k3
+; CHECK-NEXT:    kshiftlq $54, %k1, %k1
+; CHECK-NEXT:    kshiftrq $54, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $12, %k0, %k2
+; CHECK-NEXT:    kshiftlq $11, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $13, %k0, %k3
+; CHECK-NEXT:    kshiftlq $53, %k1, %k1
+; CHECK-NEXT:    kshiftrq $53, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $13, %k0, %k2
+; CHECK-NEXT:    kshiftlq $12, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $12, %k0, %k3
+; CHECK-NEXT:    kshiftlq $52, %k1, %k1
+; CHECK-NEXT:    kshiftrq $52, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $14, %k0, %k2
+; CHECK-NEXT:    kshiftlq $13, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $15, %k0, %k3
+; CHECK-NEXT:    kshiftlq $51, %k1, %k1
+; CHECK-NEXT:    kshiftrq $51, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $15, %k0, %k2
+; CHECK-NEXT:    kshiftlq $14, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $14, %k0, %k3
+; CHECK-NEXT:    kshiftlq $50, %k1, %k1
+; CHECK-NEXT:    kshiftrq $50, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $16, %k0, %k2
+; CHECK-NEXT:    kshiftlq $15, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $17, %k0, %k3
+; CHECK-NEXT:    kshiftlq $49, %k1, %k1
+; CHECK-NEXT:    kshiftrq $49, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $17, %k0, %k2
+; CHECK-NEXT:    kshiftlq $16, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $16, %k0, %k3
+; CHECK-NEXT:    kshiftlq $48, %k1, %k1
+; CHECK-NEXT:    kshiftrq $48, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $18, %k0, %k2
+; CHECK-NEXT:    kshiftlq $17, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $19, %k0, %k3
+; CHECK-NEXT:    kshiftlq $47, %k1, %k1
+; CHECK-NEXT:    kshiftrq $47, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $19, %k0, %k2
+; CHECK-NEXT:    kshiftlq $18, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $18, %k0, %k3
+; CHECK-NEXT:    kshiftlq $46, %k1, %k1
+; CHECK-NEXT:    kshiftrq $46, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $20, %k0, %k2
+; CHECK-NEXT:    kshiftlq $19, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $21, %k0, %k3
+; CHECK-NEXT:    kshiftlq $45, %k1, %k1
+; CHECK-NEXT:    kshiftrq $45, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $21, %k0, %k2
+; CHECK-NEXT:    kshiftlq $20, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $20, %k0, %k3
+; CHECK-NEXT:    kshiftlq $44, %k1, %k1
+; CHECK-NEXT:    kshiftrq $44, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $22, %k0, %k2
+; CHECK-NEXT:    kshiftlq $21, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $23, %k0, %k3
+; CHECK-NEXT:    kshiftlq $43, %k1, %k1
+; CHECK-NEXT:    kshiftrq $43, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $23, %k0, %k2
+; CHECK-NEXT:    kshiftlq $22, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $22, %k0, %k3
+; CHECK-NEXT:    kshiftlq $42, %k1, %k1
+; CHECK-NEXT:    kshiftrq $42, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $24, %k0, %k2
+; CHECK-NEXT:    kshiftlq $23, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $25, %k0, %k3
+; CHECK-NEXT:    kshiftlq $41, %k1, %k1
+; CHECK-NEXT:    kshiftrq $41, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $25, %k0, %k2
+; CHECK-NEXT:    kshiftlq $24, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $24, %k0, %k3
+; CHECK-NEXT:    kshiftlq $40, %k1, %k1
+; CHECK-NEXT:    kshiftrq $40, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $26, %k0, %k2
+; CHECK-NEXT:    kshiftlq $25, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $27, %k0, %k3
+; CHECK-NEXT:    kshiftlq $39, %k1, %k1
+; CHECK-NEXT:    kshiftrq $39, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $27, %k0, %k2
+; CHECK-NEXT:    kshiftlq $26, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $26, %k0, %k3
+; CHECK-NEXT:    kshiftlq $38, %k1, %k1
+; CHECK-NEXT:    kshiftrq $38, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $28, %k0, %k2
+; CHECK-NEXT:    kshiftlq $27, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $29, %k0, %k3
+; CHECK-NEXT:    kshiftlq $37, %k1, %k1
+; CHECK-NEXT:    kshiftrq $37, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $29, %k0, %k2
+; CHECK-NEXT:    kshiftlq $28, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $28, %k0, %k3
+; CHECK-NEXT:    kshiftlq $36, %k1, %k1
+; CHECK-NEXT:    kshiftrq $36, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k1
+; CHECK-NEXT:    kshiftlq $30, %k0, %k2
+; CHECK-NEXT:    kshiftlq $29, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $31, %k0, %k3
+; CHECK-NEXT:    kshiftlq $35, %k1, %k1
+; CHECK-NEXT:    kshiftrq $35, %k1, %k1
+; CHECK-NEXT:    korq %k2, %k1, %k2
+; CHECK-NEXT:    kshiftlq $31, %k0, %k1
+; CHECK-NEXT:    kshiftlq $30, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k1, %k3
 ; CHECK-NEXT:    vptestnmb %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpmovm2b %k1, %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; CHECK-NEXT:    vpmovb2m %ymm2, %k1
-; CHECK-NEXT:    vpmovm2b %k0, %ymm2
-; CHECK-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; CHECK-NEXT:    vpmovb2m %ymm2, %k2
-; CHECK-NEXT:    vmovdqu8 %ymm1, 32(%rsi) {%k2}
-; CHECK-NEXT:    vmovdqu8 %ymm0, (%rsi) {%k1}
+; CHECK-NEXT:    kshiftrd $30, %k0, %k0
+; CHECK-NEXT:    kshiftlq $34, %k2, %k2
+; CHECK-NEXT:    kshiftrq $34, %k2, %k2
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftlq $32, %k0, %k3
+; CHECK-NEXT:    kshiftlq $31, %k0, %k0
+; CHECK-NEXT:    korq %k0, %k3, %k0
+; CHECK-NEXT:    kshiftrd $1, %k1, %k3
+; CHECK-NEXT:    kshiftlq $33, %k2, %k2
+; CHECK-NEXT:    kshiftrq $33, %k2, %k2
+; CHECK-NEXT:    korq %k0, %k2, %k0
+; CHECK-NEXT:    kshiftlq $32, %k0, %k0
+; CHECK-NEXT:    kshiftrq $32, %k0, %k0
+; CHECK-NEXT:    kshiftlq $33, %k0, %k2
+; CHECK-NEXT:    kshiftlq $32, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $34, %k0, %k2
+; CHECK-NEXT:    kshiftlq $33, %k1, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $3, %k1, %k3
+; CHECK-NEXT:    kshiftlq $31, %k0, %k0
+; CHECK-NEXT:    kshiftrq $31, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $35, %k0, %k2
+; CHECK-NEXT:    kshiftlq $34, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $2, %k1, %k3
+; CHECK-NEXT:    kshiftlq $30, %k0, %k0
+; CHECK-NEXT:    kshiftrq $30, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $36, %k0, %k2
+; CHECK-NEXT:    kshiftlq $35, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $5, %k1, %k3
+; CHECK-NEXT:    kshiftlq $29, %k0, %k0
+; CHECK-NEXT:    kshiftrq $29, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $37, %k0, %k2
+; CHECK-NEXT:    kshiftlq $36, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $4, %k1, %k3
+; CHECK-NEXT:    kshiftlq $28, %k0, %k0
+; CHECK-NEXT:    kshiftrq $28, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $38, %k0, %k2
+; CHECK-NEXT:    kshiftlq $37, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $7, %k1, %k3
+; CHECK-NEXT:    kshiftlq $27, %k0, %k0
+; CHECK-NEXT:    kshiftrq $27, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $39, %k0, %k2
+; CHECK-NEXT:    kshiftlq $38, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $6, %k1, %k3
+; CHECK-NEXT:    kshiftlq $26, %k0, %k0
+; CHECK-NEXT:    kshiftrq $26, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $40, %k0, %k2
+; CHECK-NEXT:    kshiftlq $39, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $9, %k1, %k3
+; CHECK-NEXT:    kshiftlq $25, %k0, %k0
+; CHECK-NEXT:    kshiftrq $25, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $41, %k0, %k2
+; CHECK-NEXT:    kshiftlq $40, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $8, %k1, %k3
+; CHECK-NEXT:    kshiftlq $24, %k0, %k0
+; CHECK-NEXT:    kshiftrq $24, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $42, %k0, %k2
+; CHECK-NEXT:    kshiftlq $41, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $11, %k1, %k3
+; CHECK-NEXT:    kshiftlq $23, %k0, %k0
+; CHECK-NEXT:    kshiftrq $23, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $43, %k0, %k2
+; CHECK-NEXT:    kshiftlq $42, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $10, %k1, %k3
+; CHECK-NEXT:    kshiftlq $22, %k0, %k0
+; CHECK-NEXT:    kshiftrq $22, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $44, %k0, %k2
+; CHECK-NEXT:    kshiftlq $43, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $13, %k1, %k3
+; CHECK-NEXT:    kshiftlq $21, %k0, %k0
+; CHECK-NEXT:    kshiftrq $21, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $45, %k0, %k2
+; CHECK-NEXT:    kshiftlq $44, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $12, %k1, %k3
+; CHECK-NEXT:    kshiftlq $20, %k0, %k0
+; CHECK-NEXT:    kshiftrq $20, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $46, %k0, %k2
+; CHECK-NEXT:    kshiftlq $45, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $15, %k1, %k3
+; CHECK-NEXT:    kshiftlq $19, %k0, %k0
+; CHECK-NEXT:    kshiftrq $19, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $47, %k0, %k2
+; CHECK-NEXT:    kshiftlq $46, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $14, %k1, %k3
+; CHECK-NEXT:    kshiftlq $18, %k0, %k0
+; CHECK-NEXT:    kshiftrq $18, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $48, %k0, %k2
+; CHECK-NEXT:    kshiftlq $47, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $17, %k1, %k3
+; CHECK-NEXT:    kshiftlq $17, %k0, %k0
+; CHECK-NEXT:    kshiftrq $17, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $49, %k0, %k2
+; CHECK-NEXT:    kshiftlq $48, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $16, %k1, %k3
+; CHECK-NEXT:    kshiftlq $16, %k0, %k0
+; CHECK-NEXT:    kshiftrq $16, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $50, %k0, %k2
+; CHECK-NEXT:    kshiftlq $49, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $19, %k1, %k3
+; CHECK-NEXT:    kshiftlq $15, %k0, %k0
+; CHECK-NEXT:    kshiftrq $15, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $51, %k0, %k2
+; CHECK-NEXT:    kshiftlq $50, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $18, %k1, %k3
+; CHECK-NEXT:    kshiftlq $14, %k0, %k0
+; CHECK-NEXT:    kshiftrq $14, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $52, %k0, %k2
+; CHECK-NEXT:    kshiftlq $51, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $21, %k1, %k3
+; CHECK-NEXT:    kshiftlq $13, %k0, %k0
+; CHECK-NEXT:    kshiftrq $13, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $53, %k0, %k2
+; CHECK-NEXT:    kshiftlq $52, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $20, %k1, %k3
+; CHECK-NEXT:    kshiftlq $12, %k0, %k0
+; CHECK-NEXT:    kshiftrq $12, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $54, %k0, %k2
+; CHECK-NEXT:    kshiftlq $53, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $23, %k1, %k3
+; CHECK-NEXT:    kshiftlq $11, %k0, %k0
+; CHECK-NEXT:    kshiftrq $11, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $55, %k0, %k2
+; CHECK-NEXT:    kshiftlq $54, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $22, %k1, %k3
+; CHECK-NEXT:    kshiftlq $10, %k0, %k0
+; CHECK-NEXT:    kshiftrq $10, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $56, %k0, %k2
+; CHECK-NEXT:    kshiftlq $55, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $25, %k1, %k3
+; CHECK-NEXT:    kshiftlq $9, %k0, %k0
+; CHECK-NEXT:    kshiftrq $9, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $57, %k0, %k2
+; CHECK-NEXT:    kshiftlq $56, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $24, %k1, %k3
+; CHECK-NEXT:    kshiftlq $8, %k0, %k0
+; CHECK-NEXT:    kshiftrq $8, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $58, %k0, %k2
+; CHECK-NEXT:    kshiftlq $57, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $27, %k1, %k3
+; CHECK-NEXT:    kshiftlq $7, %k0, %k0
+; CHECK-NEXT:    kshiftrq $7, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $59, %k0, %k2
+; CHECK-NEXT:    kshiftlq $58, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $26, %k1, %k3
+; CHECK-NEXT:    kshiftlq $6, %k0, %k0
+; CHECK-NEXT:    kshiftrq $6, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $60, %k0, %k2
+; CHECK-NEXT:    kshiftlq $59, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $29, %k1, %k3
+; CHECK-NEXT:    kshiftlq $5, %k0, %k0
+; CHECK-NEXT:    kshiftrq $5, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $61, %k0, %k2
+; CHECK-NEXT:    kshiftlq $60, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $28, %k1, %k3
+; CHECK-NEXT:    kshiftlq $4, %k0, %k0
+; CHECK-NEXT:    kshiftrq $4, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $62, %k0, %k2
+; CHECK-NEXT:    kshiftlq $61, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftrd $31, %k1, %k3
+; CHECK-NEXT:    kshiftlq $3, %k0, %k0
+; CHECK-NEXT:    kshiftrq $3, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k0, %k2
+; CHECK-NEXT:    kshiftlq $62, %k3, %k3
+; CHECK-NEXT:    korq %k3, %k2, %k2
+; CHECK-NEXT:    kshiftlq $2, %k0, %k0
+; CHECK-NEXT:    kshiftrq $2, %k0, %k0
+; CHECK-NEXT:    korq %k2, %k0, %k0
+; CHECK-NEXT:    kshiftrd $30, %k1, %k1
+; CHECK-NEXT:    kshiftlq $1, %k0, %k0
+; CHECK-NEXT:    kshiftrq $1, %k0, %k0
+; CHECK-NEXT:    kshiftlq $63, %k1, %k1
+; CHECK-NEXT:    korq %k1, %k0, %k1
+; CHECK-NEXT:    vmovdqu8 %ymm1, (%rsi) {%k1}
+; CHECK-NEXT:    kshiftrq $32, %k1, %k1
+; CHECK-NEXT:    vmovdqu8 %ymm0, 32(%rsi) {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -1151,3 +2025,21 @@ entry:
 }
 declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
 
+ at mem64_dst = global i64 0, align 8
+ at mem64_src = global i64 0, align 8
+define i32 @v64i1_inline_asm() "min-legal-vector-width"="256" {
+; CHECK-LABEL: v64i1_inline_asm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovq {{.*}}(%rip), %k0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovq %k0, {{.*}}(%rip)
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    retq
+  %1 = alloca i32, align 4
+  %2 = load i64, i64* @mem64_src, align 8
+  %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2)
+  store i64 %3, i64* @mem64_dst, align 8
+  %4 = load i32, i32* %1, align 4
+  ret i32 %4
+}
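
The long kshiftlq/korq runs in the updated CHECK lines are the
scalarized v64i1 lowering at work: each i1 element is shifted to its
bit position and OR'd into a 64-bit mask, one bit at a time. A rough
scalar model of what those sequences compute (my sketch, not code from
the patch):

    #include <cstdint>

    // Scalar model of the emitted kshiftlq/korq pattern: build a 64-bit
    // mask by inserting each boolean element at its bit index.
    uint64_t assemble_mask(const bool bits[64]) {
      uint64_t mask = 0;
      for (int i = 0; i < 64; ++i)
        mask |= static_cast<uint64_t>(bits[i]) << i;
      return mask;
    }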


        

