[llvm] [AMDGPU] Add intrinsic exposing s_alloc_vgpr (PR #163951)
Diana Picus via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 03:06:05 PST 2025
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/163951
>From 96b38fbb35a69286b98e72f7f9c4a5d205724616 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 16 Oct 2025 13:48:09 +0200
Subject: [PATCH 1/3] [AMDGPU] Add intrinsic exposing s_alloc_vgpr
Make it possible to use `s_alloc_vgpr` at the IR level. This is a huge
footgun, and its use for anything other than compiler-internal purposes
is heavily discouraged. The calling code must make sure that it does not
allocate fewer VGPRs than necessary - the intrinsic is NOT a request to
the backend to limit the number of VGPRs it uses. In essence, this is
not so different from what we do with the dynamic VGPR flags of the
`amdgcn.cs.chain` intrinsic; it just makes that functionality usable in
other scenarios.
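For illustration, a minimal IR-level use could look like the sketch
below (modelled on the tests in this patch; the fallback branch is
hypothetical and only shows how a caller might react to a failed
allocation):

    declare i1 @llvm.amdgcn.s.alloc.vgpr(i32)

    define amdgpu_cs void @use_alloc_vgpr(ptr addrspace(1) %out) {
    entry:
      ; Ask the hardware for (at least) 45 VGPRs; the i1 result reflects SCC.
      %ok = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 45)
      br i1 %ok, label %use, label %fallback

    use:                                          ; allocation succeeded
      store i32 1, ptr addrspace(1) %out
      ret void

    fallback:                                     ; allocation failed
      store i32 0, ptr addrspace(1) %out
      ret void
    }

Note that with a dynamic VGPR block size of 16 (as in the test's
"amdgpu-dynamic-vgpr-block-size" attribute), a request for 45 VGPRs
would presumably be rounded up to 48, the next block boundary.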
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 ++++
.../AMDGPU/AMDGPUInstructionSelector.cpp | 16 +++++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 ++
.../Target/AMDGPU/AMDGPUSearchableTables.td | 1 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 6 +-
.../AMDGPU/always_uniform.ll | 9 +++
.../AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll | 59 +++++++++++++++++++
7 files changed, 104 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ded00b1274670..9bb305823e932 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -391,6 +391,17 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;
+// Force the VGPR allocation of the current wave to (at least) the given value.
+// The actual number of allocated VGPRs may be rounded up to match hardware
+// block boundaries.
+// It is the responsibility of the calling code to ensure it does not allocate
+// below the VGPR requirements of the current shader.
+def int_amdgcn_s_alloc_vgpr :
+ Intrinsic<
+ [llvm_i1_ty], // Returns true if the allocation succeeded, false otherwise.
+ [llvm_i32_ty], // The number of VGPRs to allocate.
+ [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
// 1st parameter: Numerator
// 2nd parameter: Denominator
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c7344426..2f9c87cb5f20e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2331,6 +2331,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_alloc_vgpr: {
+ // S_ALLOC_VGPR doesn't have a destination register; it just implicitly sets
+ // SCC. We then need to COPY it into the result vreg.
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register ResReg = I.getOperand(0).getReg();
+
+ MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
+ .add(I.getOperand(2));
+ MachineInstr *CopyMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
+ .addReg(AMDGPU::SCC);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI) &&
+ RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
+ }
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56807a475537d..dda73f13f7487 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5359,6 +5359,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_s_alloc_vgpr:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346839707..b82b2416a57f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -409,6 +409,7 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>;
def : AlwaysUniform<int_amdgcn_workgroup_id_x>;
def : AlwaysUniform<int_amdgcn_workgroup_id_y>;
def : AlwaysUniform<int_amdgcn_workgroup_id_z>;
+def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>;
def : AlwaysUniform<int_amdgcn_s_getpc>;
def : AlwaysUniform<int_amdgcn_s_getreg>;
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..9496087aec20c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in {
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX12Plus in {
- let hasSideEffects = 1, Defs = [SCC] in {
- def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
+ let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in {
+ def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr",
+ [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))]
+ >;
}
} // End SubtargetPredicate = isGFX12Plus
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 9ff670bee0f89..3f56f12f3cb34 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -183,6 +183,15 @@ define void @cluster_workgroup_max_flat_id(ptr addrspace(1) inreg %out) {
ret void
}
+; CHECK-LABEL: for function 's_alloc_vgpr':
+; CHECK: ALL VALUES UNIFORM
+define void @s_alloc_vgpr(i32 inreg %n, ptr addrspace(1) inreg %out) {
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
; CHECK-LABEL: for function 's_memtime':
; CHECK: ALL VALUES UNIFORM
define void @s_memtime(ptr addrspace(1) inreg %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
new file mode 100644
index 0000000000000..74c42b7bffd04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=DAGISEL
+
+declare i1 @llvm.amdgcn.s.alloc.vgpr(i32)
+
+define amdgpu_cs void @test_alloc_vreg_const(ptr addrspace(1) %out) #0 {
+; GISEL-LABEL: test_alloc_vreg_const:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_alloc_vgpr 45
+; GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b32 s0, s0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL-LABEL: test_alloc_vreg_const:
+; DAGISEL: ; %bb.0: ; %entry
+; DAGISEL-NEXT: s_alloc_vgpr 45
+; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
+; DAGISEL-NEXT: s_endpgm
+entry:
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 45)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_alloc_vreg_var(i32 inreg %n, ptr addrspace(1) %out) #0 {
+; GISEL-LABEL: test_alloc_vreg_var:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_alloc_vgpr s0
+; GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_and_b32 s0, s0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL-LABEL: test_alloc_vreg_var:
+; DAGISEL: ; %bb.0: ; %entry
+; DAGISEL-NEXT: s_alloc_vgpr s0
+; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
+; DAGISEL-NEXT: s_endpgm
+entry:
+ %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
+ %sel = select i1 %scc, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
>From a429b95ac863348a561b4208c02f7ee6420ec7e0 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 17 Oct 2025 14:52:46 +0200
Subject: [PATCH 2/3] Silence warning
---
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2f9c87cb5f20e..dd5d6351f37b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2341,8 +2341,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
.add(I.getOperand(2));
- MachineInstr *CopyMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
- .addReg(AMDGPU::SCC);
+ (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
+ .addReg(AMDGPU::SCC);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI) &&
RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
>From b3bb4a5cdddad233525703695dc7e2ba8dd18b60 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Tue, 21 Oct 2025 10:54:53 +0200
Subject: [PATCH 3/3] Address review comments
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9bb305823e932..2be917e5380fb 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -391,16 +391,15 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;
-// Force the VGPR allocation of the current wave to (at least) the given value.
-// The actual number of allocated VGPRs may be rounded up to match hardware
-// block boundaries.
+// Request the hardware to allocate the given number of VGPRs. The actual number
+// of allocated VGPRs may be rounded up to match hardware block boundaries.
// It is the responsibility of the calling code to ensure it does not allocate
// below the VGPR requirements of the current shader.
def int_amdgcn_s_alloc_vgpr :
- Intrinsic<
+ DefaultAttrsIntrinsic<
[llvm_i1_ty], // Returns true if the allocation succeeded, false otherwise.
[llvm_i32_ty], // The number of VGPRs to allocate.
- [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [NoUndef<RetIndex>, IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
// 1st parameter: Numerator