[llvm] AMDGPU: Use "countMaxActiveBits() <= 5" to define uint5Bits (PR #115543)

Fri Nov 8 13:00:54 PST 2024

https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/115543

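  countMaxTrailingOnes() is not the correct check for uint5Bits: it only bounds the number of trailing one bits the value may have, so a value whose low bit is known to be zero (for example, the result of a left shift by 1) passes the check even though it may not fit in 5 bits. countMaxActiveBits() <= 5 bounds the number of bits needed to represent the value, which is the intended condition. This patch follows the suggestion from https://github.com/llvm/llvm-project/pull/115372.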

>From a2bacf8ab58af4c1a0247026ea131443d6066602 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 8 Nov 2024 12:55:28 -0800
Subject: [PATCH] AMDGPU: Use "countMaxActiveBits() <= 5" to define uint5Bits

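  countMaxTrailingOnes() is not the correct check: it only bounds the
number of trailing one bits, so a value with a known-zero low bit (e.g.
the result of a left shift) passes even though it may not fit in 5 bits.
countMaxActiveBits() bounds the number of bits needed to represent the
value. This patch follows the suggestion
from https://github.com/llvm/llvm-project/pull/115372.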
---
 llvm/lib/Target/AMDGPU/SIInstructions.td    |  2 +-
 llvm/test/CodeGen/AMDGPU/extract-lowbits.ll | 25 +++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0658e030ffa5d6..755cbb7fb65492 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3554,7 +3554,7 @@ def : AMDGPUPat <
 >;
 
 def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
-  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
+  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
 }]>;
 
 // x << (bitwidth - y) >> (bitwidth - y)
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3de8db2c6a448e..0e5a68773a6ba8 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -163,6 +163,31 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind {
+; SI-LABEL: bzhi32_d0_even:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_d0_even:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %times2 = shl i32 %numlowbits, 1
+  %numhighbits = sub i32 32, %times2
+  %highbitscleared = shl i32 %val, %numhighbits
+  %masked = lshr i32 %highbitscleared, %numhighbits
+  ret i32 %masked
+}
+
 define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI-LABEL: bzhi32_d1_indexzext:
 ; SI:       ; %bb.0: