[llvm] r265283 - AVX-512: Truncating store for i1 vectors

Elena Demikhovsky via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 4 00:17:48 PDT 2016


Author: delena
Date: Mon Apr  4 02:17:47 2016
New Revision: 265283

URL: http://llvm.org/viewvc/llvm-project?rev=265283&view=rev
Log:
AVX-512: Truncating store for i1 vectors
Implemented truncating stores (truncstore) for KNL and skylake-avx512.
Covered vectors from v2i1 to v64i1. The value is stored in bits rather than bytes; for example, v32i1 is stored in 4 bytes.

Differential Revision: http://reviews.llvm.org/D18740
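
To illustrate the layout (this sketch is not part of the patch, and the
helper names are hypothetical): a truncating store of <32 x i1> packs
element I into bit I of the stored value, so the whole mask occupies 4
bytes, matching the kmovd in the SKX tests below. A rough scalar
equivalent in C++:

  #include <cstdint>
  #include <cstring>

  // Hypothetical model of the bit-packed layout: bit I holds element I.
  static uint32_t packMask32(const bool Elts[32]) {
    uint32_t Bits = 0;
    for (unsigned I = 0; I != 32; ++I)
      Bits |= uint32_t(Elts[I]) << I;
    return Bits;
  }

  static void storeMask32(const bool Elts[32], void *Addr) {
    uint32_t Bits = packMask32(Elts);
    std::memcpy(Addr, &Bits, sizeof(Bits)); // exactly 4 bytes written
  }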


Modified:
    llvm/trunk/include/llvm/Target/TargetLowering.h
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/vector-compare-results.ll

Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=265283&r1=265282&r2=265283&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetLowering.h Mon Apr  4 02:17:47 2016
@@ -637,6 +637,14 @@ public:
       getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Legal;
   }
 
+  /// Return true if the specified store with truncation is legal or custom
+  /// on this target.
+  bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
+    return isTypeLegal(ValVT) && MemVT.isSimple() &&
+      (getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Legal ||
+       getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Custom);
+  }
+
   /// Return how the indexed load should be treated: either it is legal, needs
   /// to be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.

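As an aside (a sketch, not code from the patch): the intended consumers of
this predicate are DAG combines that should leave a truncating store alone
whenever the target lowers it itself, as the combineStore change further
down does:

  // Hypothetical caller; TLI, VT (value type) and StVT (memory type) come
  // from the combine's context.
  if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
    return SDValue(); // let the target's custom lowering handle the store
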
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=265283&r1=265282&r2=265283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Apr  4 02:17:47 2016
@@ -1394,6 +1394,7 @@ X86TargetLowering::X86TargetLowering(con
       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
+      setTruncStoreAction(VT, MaskVT, Custom);
     }
     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
@@ -16106,6 +16107,65 @@ static SDValue LowerSIGN_EXTEND(SDValue
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+// Lower a truncating store. We need special lowering for vXi1 vectors.
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+  SDLoc dl(St);
+  EVT MemVT = St->getMemoryVT();
+  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
+  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+         "Expected truncstore of i1 vector");
+
+  SDValue Op = St->getValue();
+  MVT OpVT = Op.getValueType().getSimpleVT();
+  unsigned NumElts = OpVT.getVectorNumElements();
+  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+      NumElts == 16) {
+    // Truncate and store - everything is legal
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
+    if (MemVT.getSizeInBits() < 8)
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                       DAG.getUNDEF(MVT::v8i1), Op,
+                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+
+  // Only a subset of the AVX-512 features is available; assume we have just AVX-512F
+  if (NumElts <= 8) {
+    if (NumElts < 8) {
+      // Extend to an 8-element vector
+      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
+                        DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
+    }
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+  // v32i8
+  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
+  // Divide the vector into 2 parts and store each part separately
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                            DAG.getIntPtrConstant(0, dl));
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
+                              St->getMemOperand());
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                            DAG.getIntPtrConstant(16, dl));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
+
+  SDValue BasePtrHi =
+    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
+                              BasePtrHi, St->getMemOperand());
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
+}
+
 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
@@ -21444,6 +21504,7 @@ SDValue X86TargetLowering::LowerOperatio
   case ISD::GC_TRANSITION_START:
                                 return LowerGC_TRANSITION_START(Op, DAG);
   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
+  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
   }
 }
 
@@ -28021,7 +28082,7 @@ static SDValue combineStore(SDNode *N, S
     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
     // are designated for truncate store.
     // In this case we don't need any further transformations.
-    if (TLI.isTruncStoreLegal(VT, StVT))
+    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
       return SDValue();
 
     // From, To sizes and ElemCount must be pow of two

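A note on the v32i8 path above (an illustrative sketch only; the helper is
hypothetical): without VLX/BWI/DQI, each v16i1 half is packed into a 16-bit
mask, so the high half is stored 2 bytes past the base pointer, which is
where the constant 2 in BasePtrHi comes from. A scalar model:

  #include <cstdint>
  #include <cstring>

  // Hypothetical scalar model of the AVX-512F-only split: each 16-element
  // half becomes a 16-bit mask (cf. vptestmd + kmovw in the KNL tests) and
  // the halves land at byte offsets 0 and 2.
  static void storeMask32AsHalves(const bool Elts[32], uint8_t *Addr) {
    for (unsigned Half = 0; Half != 2; ++Half) {
      uint16_t Bits = 0;
      for (unsigned I = 0; I != 16; ++I)
        Bits |= uint16_t(Elts[16 * Half + I]) << I;
      std::memcpy(Addr + 2 * Half, &Bits, sizeof(Bits));
    }
  }
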
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=265283&r1=265282&r2=265283&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Mon Apr  4 02:17:47 2016
@@ -521,17 +521,9 @@ define <32 x i16> @test21(<32 x i16> %x
 define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
 ; KNL-LABEL: test22:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrd $3, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vpextrd $2, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vpextrd $1, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vmovd %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpslld $31, %ymm0, %ymm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -548,11 +540,9 @@ define void @test22(<4 x i1> %a, <4 x i1
 define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
 ; KNL-LABEL: test23:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -596,11 +586,9 @@ define void @store_v2i1(<2 x i1> %c , <2
 ; KNL-LABEL: store_v2i1:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -621,17 +609,9 @@ define void @store_v4i1(<4 x i1> %c , <4
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
 ; KNL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vpextrd $3, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vpextrd $2, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vpextrd $1, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    movb %al, (%rdi)
-; KNL-NEXT:    vmovd %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vpslld $31, %ymm0, %ymm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
@@ -1038,3 +1018,128 @@ define <64 x i8> @load_64i1(<64 x i1>* %
   %c = sext <64 x i1> %b to <64 x i8>
   ret <64 x i8> %c
 }
+
+define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
+; KNL-LABEL: store_8i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: store_8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <8 x i1> %v, <8 x i1>* %a
+  ret void
+}
+
+define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
+; KNL-LABEL: store_8i1_1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: store_8i1_1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  %v1 = trunc <8 x i16> %v to <8 x i1>
+  store <8 x i1> %v1, <8 x i1>* %a
+  ret void
+}
+
+define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
+; KNL-LABEL: store_16i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: store_16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k0
+; SKX-NEXT:    kmovw %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <16 x i1> %v, <16 x i1>* %a
+  ret void
+}
+
+define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
+; KNL-LABEL: store_32i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, 2(%rdi)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: store_32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT:    vpmovb2m %ymm0, %k0
+; SKX-NEXT:    kmovd %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <32 x i1> %v, <32 x i1>* %a
+  ret void
+}
+
+define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
+; KNL-LABEL: store_32i1_1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, 2(%rdi)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: store_32i1_1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %zmm0, %zmm0
+; SKX-NEXT:    vpmovw2m %zmm0, %k0
+; SKX-NEXT:    kmovd %k0, (%rdi)
+; SKX-NEXT:    retq
+  %v1 = trunc <32 x i16> %v to <32 x i1>
+  store <32 x i1> %v1, <32 x i1>* %a
+  ret void
+}
+
+
+define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
+;
+; SKX-LABEL: store_64i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    kmovq %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <64 x i1> %v, <64 x i1>* %a
+  ret void
+}

Modified: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-results.ll?rev=265283&r1=265282&r2=265283&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll Mon Apr  4 02:17:47 2016
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
@@ -6569,393 +6570,41 @@ define <128 x i1> @test_cmp_v128i8(<128
 ; AVX512-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm2
 ; AVX512-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm3
 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT:    vpextrb $15, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm4, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $15, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 12(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm4, %zmm4
+; AVX512-NEXT:    vpslld $31, %zmm4, %zmm4
+; AVX512-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512-NEXT:    kmovw %k0, 14(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT:    vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT:    kmovw %k0, 12(%rdi)
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT:    vpextrb $15, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 8(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT:    vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT:    kmovw %k0, 10(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT:    kmovw %k0, 8(%rdi)
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vpextrb $15, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, 4(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT:    kmovw %k0, 6(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kmovw %k0, 4(%rdi)
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpextrb $15, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    andb $1, %al
-; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT:    kmovw %k0, (%rdi)
 ; AVX512-NEXT:    movq %rdi, %rax
 ; AVX512-NEXT:    retq
   %1 = icmp sgt <128 x i8> %a0, %a1
