[llvm] 611a648 - [AMDGPU] Add llvm.amdgcn.dead intrinsic (#123190)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 00:25:52 PST 2025
Author: Diana Picus
Date: 2025-02-20T09:25:48+01:00
New Revision: 611a648327e9f6dad174e5c4427b27b8b7830fc0
URL: https://github.com/llvm/llvm-project/commit/611a648327e9f6dad174e5c4427b27b8b7830fc0
DIFF: https://github.com/llvm/llvm-project/commit/611a648327e9f6dad174e5c4427b27b8b7830fc0.diff
LOG: [AMDGPU] Add llvm.amdgcn.dead intrinsic (#123190)
Shaders that use the llvm.amdgcn.init.whole.wave intrinsic need to
explicitly preserve the inactive lanes of VGPRs of interest by adding
them as dummy arguments. The code usually looks something like this:
```
define amdgpu_cs_chain void f(active vgpr args..., i32 %inactive.vgpr1, ..., i32 %inactive.vgprN) {
entry:
%c = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %c, label %shader, label %tail
shader:
[...]
tail:
%inactive.vgpr.arg1 = phi i32 [ %inactive.vgpr1, %entry], [poison, %shader]
[...]
; %inactive.vgpr* then get passed into a llvm.amdgcn.cs.chain call
```
Unfortunately, this kind of phi node will get optimized away and the
backend won't be able to figure out that it's ok to use the active lanes
of `%inactive.vgpr*` inside `shader`.
This patch fixes the issue by introducing an llvm.amdgcn.dead intrinsic,
whose result can be used as a PHI operand instead of poison. The intrinsic
is selected to an IMPLICIT_DEF, which the backend can work with.
At the moment, the llvm.amdgcn.dead intrinsic works only on i32 values.
Support for other types can be added later if needed.
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 1e4f25c642493..876a6f816ad3f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3463,4 +3463,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
[llvm_anyptr_ty], [llvm_anyptr_ty],
[IntrNoMem, IntrSpeculatable]
>;
+
+/// Make it clear to the backend that this value is really dead. For instance,
+/// when used as an input to a phi node, it will make it possible for the
+/// backend to allocate the dead lanes for operations within the corresponding
+/// incoming block.
+def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_any_ty], [],
+ [IntrNoMem, IntrWillReturn, IntrNoCallback]>;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf3843869808b..28c5a53508556 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_permlane16_swap:
case Intrinsic::amdgcn_permlane32_swap:
return selectPermlaneSwapIntrin(I, IntrinsicID);
+ case Intrinsic::amdgcn_dead: {
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ I.removeOperand(1); // drop intrinsic ID
+ return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+ AMDGPU::VGPR_32RegClass, *MRI);
+ }
default:
return selectImpl(I, *CoverageInfo);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2e5f42c3bdc40..2693ad3894cca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_ds_bpermute_fi_b32:
+ case Intrinsic::amdgcn_dead:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_cvt_pkrtz:
if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 10175557fadc7..3b62dcf3c92cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -362,6 +362,8 @@ def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
+def : SourceOfDivergence<int_amdgcn_dead>;
+
class AlwaysUniform<Intrinsic intr> {
Intrinsic Intr = intr;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3faf0795157dc..598475763d02d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4276,3 +4276,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
let hasSideEffects = 1;
let SubtargetPredicate = isGFX10Plus;
}
+
+// FIXME: Would be nice if we could set the register class for the destination
+// register too.
+def IMP_DEF_FROM_INTRINSIC: Pat<
+ (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index aa5208560817f..bb840023daf5d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -520,7 +520,12 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
ret void
}
-
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32()
+define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
+ %v = call i32 @llvm.amdgcn.dead.i32()
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -558,5 +563,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
+declare i32 @llvm.amdgcn.dead.i32()
+
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
new file mode 100644
index 0000000000000..a009854542f21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s
+
+; Test that we can use v0 for temporaries in the if.then block.
+define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
+; ASM-DAG-LABEL: dead:
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-DAG-NEXT: s_wait_expcnt 0x0
+; ASM-DAG-NEXT: s_wait_samplecnt 0x0
+; ASM-DAG-NEXT: s_wait_bvhcnt 0x0
+; ASM-DAG-NEXT: s_wait_kmcnt 0x0
+; ASM-DAG-NEXT: v_mov_b32_e32 v4, v0
+; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1
+; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo
+; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1
+; ASM-DAG-NEXT: s_cbranch_execz .LBB0_2
+; ASM-DAG-NEXT: ; %bb.1: ; %if.then
+; ASM-DAG-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-DAG-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-DAG-NEXT: ; implicit-def: $vgpr0
+; ASM-DAG-NEXT: .LBB0_2: ; %if.end
+; ASM-DAG-NEXT: s_wait_alu 0xfffe
+; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-DAG-NEXT: s_setpc_b64 s[30:31]
+;
+; ASM-GISEL-LABEL: dead:
+; ASM-GISEL: ; %bb.0: ; %entry
+; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-NEXT: s_wait_expcnt 0x0
+; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
+; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
+; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v4, v0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1
+; ASM-GISEL-NEXT: s_cbranch_execz .LBB0_2
+; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
+; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
+; ASM-GISEL-NEXT: .LBB0_2: ; %if.end
+; ASM-GISEL-NEXT: s_wait_alu 0xfffe
+; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %dead = call i32 @llvm.amdgcn.dead.i32()
+ br i1 %cond, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %temp = add i32 %x, 1
+ store i32 %temp, ptr addrspace(1) %ptr1
+ br label %if.end
+
+if.end:
+ %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1bdaa4c98127d..110192ecefe55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -1115,4 +1115,141 @@ tail:
unreachable
}
+; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
+; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
+; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
+; it clear to the backend that it's safe to allocate v9's active lanes inside
+; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
+define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
+; GISEL12-LABEL: with_inactive_vgprs:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL12-NEXT: s_mov_b32 s4, s0
+; GISEL12-NEXT: s_mov_b32 s5, s1
+; GISEL12-NEXT: s_mov_b32 s0, s3
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL12-NEXT: s_cbranch_execz .LBB6_2
+; GISEL12-NEXT: ; %bb.1: ; %shader
+; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; GISEL12-NEXT: ;;#ASMSTART
+; GISEL12-NEXT: ; use v0-7
+; GISEL12-NEXT: ;;#ASMEND
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL12-NEXT: flat_store_b32 v[9:10], v11
+; GISEL12-NEXT: ; implicit-def: $vgpr9
+; GISEL12-NEXT: .LBB6_2: ; %tail.block
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL12-NEXT: s_mov_b32 exec_lo, s2
+; GISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL12-LABEL: with_inactive_vgprs:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL12-NEXT: s_mov_b32 s5, s1
+; DAGISEL12-NEXT: s_mov_b32 s4, s0
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL12-NEXT: ; %bb.1: ; %shader
+; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; DAGISEL12-NEXT: ;;#ASMSTART
+; DAGISEL12-NEXT: ; use v0-7
+; DAGISEL12-NEXT: ;;#ASMEND
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
+; DAGISEL12-NEXT: ; implicit-def: $vgpr9
+; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL12-NEXT: s_mov_b32 s0, s3
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; GISEL10-LABEL: with_inactive_vgprs:
+; GISEL10: ; %bb.0: ; %entry
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL10-NEXT: s_mov_b32 s4, s0
+; GISEL10-NEXT: s_mov_b32 s5, s1
+; GISEL10-NEXT: s_mov_b32 s0, s3
+; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL10-NEXT: s_cbranch_execz .LBB6_2
+; GISEL10-NEXT: ; %bb.1: ; %shader
+; GISEL10-NEXT: v_mov_b32_e32 v10, s5
+; GISEL10-NEXT: v_mov_b32_e32 v9, s4
+; GISEL10-NEXT: flat_load_dword v11, v[9:10]
+; GISEL10-NEXT: ;;#ASMSTART
+; GISEL10-NEXT: ; use v0-7
+; GISEL10-NEXT: ;;#ASMEND
+; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL10-NEXT: flat_store_dword v[9:10], v11
+; GISEL10-NEXT: ; implicit-def: $vgpr9
+; GISEL10-NEXT: .LBB6_2: ; %tail.block
+; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL10-NEXT: s_mov_b32 exec_lo, s2
+; GISEL10-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL10-LABEL: with_inactive_vgprs:
+; DAGISEL10: ; %bb.0: ; %entry
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL10-NEXT: s_mov_b32 s5, s1
+; DAGISEL10-NEXT: s_mov_b32 s4, s0
+; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL10-NEXT: ; %bb.1: ; %shader
+; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
+; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
+; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
+; DAGISEL10-NEXT: ;;#ASMSTART
+; DAGISEL10-NEXT: ; use v0-7
+; DAGISEL10-NEXT: ;;#ASMEND
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
+; DAGISEL10-NEXT: ; implicit-def: $vgpr9
+; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL10-NEXT: s_mov_b32 s0, s3
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
+entry:
+ %imp.def = call i32 @llvm.amdgcn.dead()
+ %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %initial.exec, label %shader, label %tail.block
+
+shader: ; preds = %entry
+ %use.another.vgpr = load i32, ptr %callee ; something that won't be moved past the inline asm
+ call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
+ store i32 %use.another.vgpr, ptr %callee
+ %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
+ br label %tail.block
+
+tail.block: ; preds = %shader, %entry
+ %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
+ %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
+ %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
+ %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
+ call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32} %vgprs, i32 0)
+ unreachable
+}
+
declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
More information about the llvm-commits
mailing list