[llvm] [DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. (PR #80473)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 5 05:52:46 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/80473
>From 873365f02e239216512f33536762452fde45ab5d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 2 Feb 2024 18:16:27 +0000
Subject: [PATCH] [DAG] visitCTPOP - if only the upper half of the ctpop
operand is zero then see if it's profitable to only count the lower half.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/ctpop64.ll | 18 ++++++++----------
llvm/test/CodeGen/X86/ctpop-mask.ll | 10 +++++-----
3 files changed, 31 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3ce45e0e43bf4..804823af8daf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
SDValue DAGCombiner::visitCTPOP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
SDLoc DL(N);
// fold (ctpop c1) -> c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0}))
return C;
+
+ // If the upper bits are known to be zero, then see if it's profitable to
+ // only count the lower bits.
+ if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) {
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
+ if (hasOperation(ISD::CTPOP, HalfVT) &&
+ TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) &&
+ TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT)) {
+ APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2);
+ if (DAG.MaskedValueIsZero(N0, UpperBits)) {
+ SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT,
+ DAG.getZExtOrTrunc(N0, DL, HalfVT));
+ return DAG.getZExtOrTrunc(PopCnt, DL, VT);
+ }
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 1346678e51e3d..3b9c3e3ba1752 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_and_b32 s4, s8, 0xff
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_and_b32 s4, s8, 1
-; SI-NEXT: s_mov_b32 s5, 0
-; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; SI-NEXT: s_add_i32 s4, s6, s4
+; SI-NEXT: s_bcnt1_i32_b32 s4, s4
+; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
+; SI-NEXT: s_add_i32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_and_b32 s4, s8, 0xff
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_and_b32 s4, s8, 1
-; VI-NEXT: s_mov_b32 s5, 0
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_add_i32 s4, s6, s4
+; VI-NEXT: s_bcnt1_i32_b32 s4, s4
+; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
+; VI-NEXT: s_add_i32 s4, s5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index abbcf22f77e43..e0a96a9f98879 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask2:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $3, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask2:
@@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask4:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $15, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask4:
@@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask5:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $31, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask5:
@@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_shifted_mask6:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $26112, %edi # imm = 0x6600
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
@@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_shifted_mask8:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $65280, %edi # imm = 0xFF00
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8:
More information about the llvm-commits
mailing list