[llvm] 2258bc4 - [AMDGPU] Simplify, fix and improve known bits for mbcnt (#104768)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 19 10:22:09 PDT 2024
Author: Jay Foad
Date: 2024-08-19T18:22:06+01:00
New Revision: 2258bc429b6f76ade78059f3c4f6c7c77f93a996
URL: https://github.com/llvm/llvm-project/commit/2258bc429b6f76ade78059f3c4f6c7c77f93a996
DIFF: https://github.com/llvm/llvm-project/commit/2258bc429b6f76ade78059f3c4f6c7c77f93a996.diff
LOG: [AMDGPU] Simplify, fix and improve known bits for mbcnt (#104768)
Simplify by using KnownBits::add.
Fix the GlobalISel path, which was ignoring the known bits of src1.
Improve the analysis of mbcnt.hi, which adds at most 31 to src1 even in wave64.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 09fe4ac33c3267..ae55b56fbf43fb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15758,16 +15758,12 @@ void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::amdgcn_mbcnt_hi: {
const GCNSubtarget &ST =
DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
- // These return at most the (wavefront size - 1) + src1
- // As long as src1 is an immediate we can calc known bits
- KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
- unsigned Src1ValBits = Src1Known.countMaxActiveBits();
- unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
- // Cater for potential carry
- MaxActiveBits += Src1ValBits ? 1 : 0;
- unsigned Size = Op.getValueType().getSizeInBits();
- if (MaxActiveBits < Size)
- Known.Zero.setHighBits(Size - MaxActiveBits);
+ // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
+ // most 31 + src1.
+ Known.Zero.setBitsFrom(
+ IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
+ Known = KnownBits::add(Known, Known2);
return;
}
}
@@ -15802,7 +15798,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
switch (MI->getOpcode()) {
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
- switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
+ Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
+ switch (IID) {
case Intrinsic::amdgcn_workitem_id_x:
knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
break;
@@ -15814,9 +15811,15 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
break;
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
- // These return at most the wavefront size - 1.
- unsigned Size = MRI.getType(R).getSizeInBits();
- Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+ // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
+ // most 31 + src1.
+ Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
+ ? getSubtarget()->getWavefrontSizeLog2()
+ : 5);
+ KnownBits Known2;
+ KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::add(Known, Known2);
break;
}
case Intrinsic::amdgcn_groupstaticsize: {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 4ac9dc8565ff40..fd1b99a9f30eb7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -1,10 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}mbcnt_intrinsics:
-; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0
-; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
-; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]]
+; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
+; GCN: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
define amdgpu_ps void @mbcnt_intrinsics(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
@@ -80,13 +79,25 @@ define i32 @mbcnt_hi_known_bits_3(i32 %x) #0 {
; GCN-LABEL: {{^}}mbcnt_hi_known_bits_4:
; GCN: v_mbcnt_hi_u32_b32
-; GCN: v_and_b32_e32
+; GCN-NOT: v_and_b32_e32
define i32 @mbcnt_hi_known_bits_4(i32 %x) #0 {
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 15)
%mask = and i32 %hi, 63
ret i32 %mask
}
+; TODO: Special case mbcnt.lo feeding into mbcnt.hi to remove this AND.
+; GCN-LABEL: {{^}}mbcnt_lo_hi_known_bits_1:
+; GCN: v_mbcnt_lo_u32_b32
+; GCN: v_mbcnt_hi_u32_b32
+; GCN: v_and_b32_e32
+define i32 @mbcnt_lo_hi_known_bits_1(i32 %x) #0 {
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 %x, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 %lo)
+ %mask = and i32 %hi, 63
+ ret i32 %mask
+}
+
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
More information about the llvm-commits mailing list