[llvm] b8cdc26 - [DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. (#80473)
    via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Tue Feb  6 04:19:36 PST 2024
    
    
  
Author: Simon Pilgrim
Date: 2024-02-06T12:19:31Z
New Revision: b8cdc2638e4c067fd633b345aba75fee81c4054f
URL: https://github.com/llvm/llvm-project/commit/b8cdc2638e4c067fd633b345aba75fee81c4054f
DIFF: https://github.com/llvm/llvm-project/commit/b8cdc2638e4c067fd633b345aba75fee81c4054f.diff
LOG: [DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. (#80473)
Added: 
    
Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/ctpop64.ll
    llvm/test/CodeGen/X86/ctpop-mask.ll
Removed: 
    
################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7f91de12e10d02..291a085b485f95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
 SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  unsigned NumBits = VT.getScalarSizeInBits();
   SDLoc DL(N);
 
   // fold (ctpop c1) -> c2
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0}))
     return C;
+
+  // If the upper bits are known to be zero, then see if its profitable to
+  // only count the lower bits.
+  if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) {
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
+    if (hasOperation(ISD::CTPOP, HalfVT) &&
+        TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) &&
+        TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT)) {
+      APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2);
+      if (DAG.MaskedValueIsZero(N0, UpperBits)) {
+        SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT,
+                                     DAG.getZExtOrTrunc(N0, DL, HalfVT));
+        return DAG.getZExtOrTrunc(PopCnt, DL, VT);
+      }
+    }
+  }
+
   return SDValue();
 }
 
diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 1346678e51e3dd..3b9c3e3ba17523 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_and_b32 s4, s8, 0xff
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_and_b32 s4, s8, 1
-; SI-NEXT:    s_mov_b32 s5, 0
-; SI-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; SI-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; SI-NEXT:    s_add_i32 s4, s6, s4
+; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
+; SI-NEXT:    s_bcnt1_i32_b64 s5, s[6:7]
+; SI-NEXT:    s_add_i32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_and_b32 s4, s8, 0xff
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_and_b32 s4, s8, 1
-; VI-NEXT:    s_mov_b32 s5, 0
-; VI-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT:    s_add_i32 s4, s6, s4
+; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
+; VI-NEXT:    s_bcnt1_i32_b64 s5, s[6:7]
+; VI-NEXT:    s_add_i32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
diff  --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index abbcf22f77e433..e0a96a9f988791 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask2:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $3, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask2:
@@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask4:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $15, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask4:
@@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask5:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $31, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask5:
@@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_shifted_mask6:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $26112, %edi # imm = 0x6600
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
@@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_shifted_mask8:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $65280, %edi # imm = 0xFF00
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8:
        
    
    
More information about the llvm-commits
mailing list