[llvm] r318871 - [X86] Support v32i16/v64i8 CTLZ using lookup table.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 22 12:05:57 PST 2017


Author: ctopper
Date: Wed Nov 22 12:05:57 2017
New Revision: 318871

URL: http://llvm.org/viewvc/llvm-project?rev=318871&view=rev
Log:
[X86] Support v32i16/v64i8 CTLZ using lookup table.

Had to tweak the setccs used by the code to produce a vXi1 result type, with a sign extend back to the full vector width.
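For context, here is a minimal scalar sketch of what the in-register lookup-table
lowering computes per element. This is an illustration only, not the committed DAG
code: the vector form uses VPSHUFB as the 16-entry table, and on 512-bit types the
(Hi == 0) compare below is the setcc that had to move to a vXi1 result, with the
sign extend rebuilding the all-ones/all-zeros lane mask (VPMOVM2B in the test
output further down).

  #include <cassert>
  #include <cstdint>

  // 16-entry table: leading zero count of a 4-bit nibble.
  static const uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};

  // Per-byte CTLZ: look up both nibbles, and only add the low-nibble
  // count when the high nibble is zero (the HiZ mask in the lowering).
  static uint8_t ctlz8(uint8_t X) {
    uint8_t Hi = X >> 4, Lo = X & 0xF;
    uint8_t HiZ = (Hi == 0) ? 0xFF : 0x00; // setcc + sign extend
    return LUT[Hi] + (LUT[Lo] & HiZ);
  }

  // Widen to i16: take the high byte's count, and add the low byte's
  // count only when the entire high byte is zero.
  static uint16_t ctlz16(uint16_t X) {
    uint8_t Hi = X >> 8, Lo = X & 0xFF;
    uint16_t HiZ = (Hi == 0) ? 0xFFFF : 0x0000;
    return ctlz8(Hi) + (ctlz8(Lo) & HiZ);
  }

  int main() {
    assert(ctlz16(0x0000) == 16);
    assert(ctlz16(0x0001) == 15);
    assert(ctlz16(0x8000) == 0);
    return 0;
  }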

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=318871&r1=318870&r2=318871&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Nov 22 12:05:57 2017
@@ -1493,11 +1493,6 @@ X86TargetLowering::X86TargetLowering(con
       setOperationAction(ISD::MSTORE,              VT, Action);
     }
 
-    if (Subtarget.hasCDI()) {
-      setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
-      setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
-    }
-
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VSELECT,      VT, Custom);
@@ -1509,6 +1504,7 @@ X86TargetLowering::X86TargetLowering(con
       setOperationAction(ISD::MSTORE,       VT, Legal);
       setOperationAction(ISD::CTPOP,        VT, Custom);
       setOperationAction(ISD::CTTZ,         VT, Custom);
+      setOperationAction(ISD::CTLZ,         VT, Custom);
       setOperationAction(ISD::SMAX,         VT, Legal);
       setOperationAction(ISD::UMAX,         VT, Legal);
       setOperationAction(ISD::SMIN,         VT, Legal);
@@ -21677,7 +21673,14 @@ static SDValue LowerVectorCTLZInRegLUT(S
   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
-  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+  SDValue HiZ;
+  if (CurrVT.is512BitVector()) {
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
+    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+  } else {
+    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+  }
 
   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
@@ -21697,8 +21700,15 @@ static SDValue LowerVectorCTLZInRegLUT(S
     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
 
     // Check if the upper half of the input element is zero.
-    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
-                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+    if (CurrVT.is512BitVector()) {
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
+                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+    } else {
+      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+    }
     HiZ = DAG.getBitcast(NextVT, HiZ);
 
     // Move the upper/lower halves to the lower bits as we'll be extending to

Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll?rev=318871&r1=318870&r2=318871&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll Wed Nov 22 12:05:57 2017
@@ -367,27 +367,24 @@ define <32 x i16> @testv32i16(<32 x i16>
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv32i16:
@@ -457,27 +454,24 @@ define <32 x i16> @testv32i16u(<32 x i16
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT:    vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv32i16u:
@@ -571,25 +565,17 @@ define <64 x i8> @testv64i8(<64 x i8> %i
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
@@ -673,25 +659,17 @@ define <64 x i8> @testv64i8u(<64 x i8> %
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT:    vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: testv64i8u:
