[llvm] [DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. (PR #80473)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 5 05:52:46 PST 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/80473

>From 873365f02e239216512f33536762452fde45ab5d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 2 Feb 2024 18:16:27 +0000
Subject: [PATCH] [DAG] visitCTPOP - if only the upper half of the ctpop
 operand is zero then see if it's profitable to only count the lower half.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/ctpop64.ll           | 18 ++++++++----------
 llvm/test/CodeGen/X86/ctpop-mask.ll           | 10 +++++-----
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3ce45e0e43bf4..804823af8daf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
 SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  unsigned NumBits = VT.getScalarSizeInBits();
   SDLoc DL(N);
 
   // fold (ctpop c1) -> c2
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0}))
     return C;
+
+  // If the upper half of the operand is known to be zero, check whether it's
+  // profitable to count only the lower half and zero-extend the result.
+  if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) {
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
+    if (hasOperation(ISD::CTPOP, HalfVT) &&
+        TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) &&
+        TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT)) {
+      APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2);
+      if (DAG.MaskedValueIsZero(N0, UpperBits)) {
+        SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT,
+                                     DAG.getZExtOrTrunc(N0, DL, HalfVT));
+        return DAG.getZExtOrTrunc(PopCnt, DL, VT);
+      }
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 1346678e51e3d..3b9c3e3ba1752 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_and_b32 s4, s8, 0xff
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_and_b32 s4, s8, 1
-; SI-NEXT:    s_mov_b32 s5, 0
-; SI-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; SI-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; SI-NEXT:    s_add_i32 s4, s6, s4
+; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
+; SI-NEXT:    s_bcnt1_i32_b64 s5, s[6:7]
+; SI-NEXT:    s_add_i32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_and_b32 s4, s8, 0xff
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    s_and_b32 s4, s8, 1
-; VI-NEXT:    s_mov_b32 s5, 0
-; VI-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT:    s_add_i32 s4, s6, s4
+; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
+; VI-NEXT:    s_bcnt1_i32_b64 s5, s[6:7]
+; VI-NEXT:    s_add_i32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index abbcf22f77e43..e0a96a9f98879 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask2:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $3, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask2:
@@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask4:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $15, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask4:
@@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_mask5:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $31, %edi
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_mask5:
@@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_shifted_mask6:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $26112, %edi # imm = 0x6600
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
@@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone {
 ; X64-POPCOUNT-LABEL: ctpop_shifted_mask8:
 ; X64-POPCOUNT:       # %bb.0:
 ; X64-POPCOUNT-NEXT:    andl $65280, %edi # imm = 0xFF00
-; X64-POPCOUNT-NEXT:    popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT:    popcntl %edi, %eax
 ; X64-POPCOUNT-NEXT:    retq
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8:



More information about the llvm-commits mailing list