[llvm] 2af1640 - [LegalizeDAG][X86][AMDGPU] Use ANY_EXTEND instead of ZERO_EXTEND when promoting ISD::CTTZ/CTTZ_ZERO_UNDEF.

Fri Feb 7 22:26:50 PST 2020

Author: Craig Topper
Date: 2020-02-07T22:25:56-08:00
New Revision: 2af1640f9aa4ebe5d447586bbdad6514312bb814

URL: https://github.com/llvm/llvm-project/commit/2af1640f9aa4ebe5d447586bbdad6514312bb814
DIFF: https://github.com/llvm/llvm-project/commit/2af1640f9aa4ebe5d447586bbdad6514312bb814.diff

LOG: [LegalizeDAG][X86][AMDGPU] Use ANY_EXTEND instead of ZERO_EXTEND when promoting ISD::CTTZ/CTTZ_ZERO_UNDEF.

Summary:
For CTTZ we place a set bit just past where the non-promoted type
stopped so the extended bits won't be used for the count. For
CTTZ_ZERO_UNDEF we don't care what happens if no bits are set in
the original type and we end up counting into the extended bits.
So we can just use ANY_EXTEND for both cases.

This matches what is done in type legalization for these operations.
We make no effort to force the upper bits to zero.

Differential Revision: https://reviews.llvm.org/D74111

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/X86/clz.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5a23573d033c..d03f765a5779 100644

--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4342,8 +4342,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTPOP:
-    // Zero extend the argument.
-    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    // Zero extend the argument unless its cttz, then use any_extend.
+    if (Node->getOpcode() == ISD::CTTZ ||
+        Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+      Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+    else
+      Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+
     if (Node->getOpcode() == ISD::CTTZ) {
       // The count is the same in the promoted type except if the original
       // value was zero.  This can be handled by setting the bit just off

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index c5e7a7310e63..626ef156a91e 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -131,7 +131,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* n
 
 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
 ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI-SDWA: v_ffbl_b32_sdwa
+; SI-SDWA: v_ffbl_b32_e32
 ; EG: MEM_RAT MSKOR
 define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
   %val = load i8, i8 addrspace(1)* %arrayidx, align 1
@@ -144,7 +144,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noa
 
 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
 ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI-SDWA: v_ffbl_b32_sdwa
+; SI-SDWA: v_ffbl_b32_e32
 ; EG: MEM_RAT MSKOR
 define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
   %val = load i16, i16 addrspace(1)* %arrayidx, align 1
@@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 ; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
 ; SI: {{buffer|flat}}_load_ubyte
 ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI-SDWA: v_ffbl_b32_sdwa
+; SI-SDWA: v_ffbl_b32_e32
 ; EG: MEM_RAT MSKOR
 ; EG: FFBL_INT
  define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {

diff  --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
index f76f5b10151c..7884f3ebcc0c 100644
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -17,29 +17,25 @@ declare i64 @llvm.ctlz.i64(i64, i1)
 define i8 @cttz_i8(i8 %x)  {
 ; X32-LABEL: cttz_i8:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    bsfl %eax, %eax
+; X32-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    bsfl %eax, %eax
+; X64-NEXT:    bsfl %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-CLZ-LABEL: cttz_i8:
 ; X32-CLZ:       # %bb.0:
-; X32-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-CLZ-NEXT:    tzcntl %eax, %eax
+; X32-CLZ-NEXT:    tzcntl {{[0-9]+}}(%esp), %eax
 ; X32-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: cttz_i8:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    tzcntl %eax, %eax
+; X64-CLZ-NEXT:    tzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true )
@@ -503,17 +499,16 @@ define i8 @cttz_i8_zero_test(i8 %n) {
 ;
 ; X32-CLZ-LABEL: cttz_i8_zero_test:
 ; X32-CLZ:       # %bb.0:
-; X32-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-CLZ-NEXT:    orl $256, %eax # imm = 0x100
+; X32-CLZ-NEXT:    movl $256, %eax # imm = 0x100
+; X32-CLZ-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X32-CLZ-NEXT:    tzcntl %eax, %eax
 ; X32-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: cttz_i8_zero_test:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    orl $256, %eax # imm = 0x100
-; X64-CLZ-NEXT:    tzcntl %eax, %eax
+; X64-CLZ-NEXT:    orl $256, %edi # imm = 0x100
+; X64-CLZ-NEXT:    tzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)