[llvm] r318871 - [X86] Support v32i16/v64i8 CTLZ using lookup table.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 22 12:05:57 PST 2017
Author: ctopper
Date: Wed Nov 22 12:05:57 2017
New Revision: 318871
URL: http://llvm.org/viewvc/llvm-project?rev=318871&view=rev
Log:
[X86] Support v32i16/v64i8 CTLZ using lookup table.
Had to tweak the setccs used by the code to use a vXi1 result type with a sign extend back to the vector size.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=318871&r1=318870&r2=318871&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Nov 22 12:05:57 2017
@@ -1493,11 +1493,6 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MSTORE, VT, Action);
}
- if (Subtarget.hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
- }
-
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1509,6 +1504,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
@@ -21677,7 +21673,14 @@ static SDValue LowerVectorCTLZInRegLUT(S
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
- SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+ SDValue HiZ;
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+ }
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
@@ -21697,8 +21700,15 @@ static SDValue LowerVectorCTLZInRegLUT(S
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
- SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
- DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ }
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll?rev=318871&r1=318870&r2=318871&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll Wed Nov 22 12:05:57 2017
@@ -367,27 +367,24 @@ define <32 x i16> @testv32i16(<32 x i16>
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv32i16:
@@ -457,27 +454,24 @@ define <32 x i16> @testv32i16u(<32 x i16
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv32i16u:
@@ -571,25 +565,17 @@ define <64 x i8> @testv64i8(<64 x i8> %i
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
@@ -673,25 +659,17 @@ define <64 x i8> @testv64i8u(<64 x i8> %
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
More information about the llvm-commits
mailing list