[llvm] [AMDGPU][GlobalISel] Enable kernel argument preloading (PR #134655)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 7 06:52:40 PDT 2025
https://github.com/tgymnich created https://github.com/llvm/llvm-project/pull/134655
- enable kernel argument preloading
>From 3a8bcf84c306ac8e578c0cfbda9764e6f3224774 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Fri, 14 Mar 2025 09:31:48 +0000
Subject: [PATCH 1/2] [AMDGPU][GlobalISel] Enable kernel argument preloading
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 42 ++++++++++++++++++-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 6 +--
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
5 files changed, 45 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a15f193549936..48c65f37d9ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -507,6 +508,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getDataLayout();
+ const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -520,7 +522,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
// TODO: Align down to dword alignment and extract bits for extending loads.
for (auto &Arg : F.args()) {
- // TODO: Add support for kernarg preload.
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
return false;
@@ -545,11 +546,47 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
- if (IsByRef) {
+ if (Arg.hasInRegAttr() && IsEntryFunc && Subtarget->hasKernargPreload()) {
+ unsigned NumAllocSGPRs = alignTo(DL.getTypeSizeInBits(ArgTy), 32) / 32;
+
+ unsigned Padding = ArgOffset - BaseOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI->getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ LLT RegTy = getLLTForType(*ArgTy, DL);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info->addPreloadedKernArg(*TRI, RC, NumAllocSGPRs, i, PaddingSGPRs);
+
+ SmallVector<Register> SrcRegs;
+
+ if (PreloadRegs->size() > 1) {
+ RC = &AMDGPU::SGPR_32RegClass;
+ RegTy = LLT::scalar(32);
+ }
+
+ for (auto &PhysReg : *PreloadRegs) {
+ assert(PhysReg);
+ Register VReg = MF.addLiveIn(PhysReg, RC);
+ B.getMBB().addLiveIn(PhysReg);
+ MRI.setType(VReg, RegTy);
+ B.buildCopy(VReg, Register(PhysReg));
+ CCInfo.AllocateReg(PhysReg);
+ SrcRegs.push_back(VReg);
+ }
+
+ if (SrcRegs.size() > 1) {
+ B.buildMergeLikeInstr(VRegs[i][0], SrcRegs);
+ } else {
+ MRI.replaceRegWith(SrcRegs[0], VRegs[i][0]);
+ }
+ } else if (IsByRef) {
unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
assert(VRegs[i].size() == 1 &&
"expected only one register for byval pointers");
+
if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
lowerParameterPtr(VRegs[i][0], B, ArgOffset);
} else {
@@ -570,6 +607,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
}
TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 533ad349f7500..0d5c0a71ce7af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1193,9 +1193,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
- CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State) const {
const MachineFunction &MF = State.getMachineFunction();
const Function &Fn = MF.getFunction();
LLVMContext &Ctx = Fn.getParent()->getContext();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6705f86e15fc2..2a69e8faab2a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,13 +172,11 @@ class AMDGPUTargetLowering : public TargetLowering {
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) const;
- void analyzeFormalArgumentsCompute(
- CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+ void analyzeFormalArgumentsCompute(CCState &State) const;
+
bool mayIgnoreSignedZero(SDValue Op) const;
static inline SDValue stripBitcast(SDValue Val) {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 157ca4b08020a..6c074ef8f396f 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1463,7 +1463,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
if (AMDGPU::isShader(CallConv)) {
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
} else {
- analyzeFormalArgumentsCompute(CCInfo, Ins);
+ analyzeFormalArgumentsCompute(CCInfo);
}
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..a2288d136418f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2929,7 +2929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (IsKernel)
- analyzeFormalArgumentsCompute(CCInfo, Ins);
+ analyzeFormalArgumentsCompute(CCInfo);
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
>From 5826b8cba568166e7750447a8802040e09b7188a Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Mon, 17 Mar 2025 23:25:08 +0000
Subject: [PATCH 2/2] update test
---
.../amdhsa-kernarg-preload-num-sgprs.ll | 133 ++++++++++++------
1 file changed, 90 insertions(+), 43 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..91fb3d0f84546 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -1,18 +1,32 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=ASM %s
-
-; OBJDUMP: Contents of section .rodata:
-; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
-; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
-; OBJDUMP-NEXT: 0030 8000af00 98130000 1e000400 00000000 ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
-; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJDUMP-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=ASM-GISEL %s
+
+; OBJDUMP-SDAG: Contents of section .rodata:
+; OBJDUMP-SDAG-NEXT: 0000 00000000 00000000 10010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0010 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0020 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0030 8000af00 98130000 1e000400 00000000 ................
+
+; OBJDUMP-GISEL: Contents of section .rodata:
+; OBJDUMP-GISEL-NEXT: 0000 00000000 00000000 10010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0010 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0020 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0030 0000af00 90130000 1e000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-SDAG: .amdhsa_user_sgpr_count 12
+; ASM-SDAG: .amdhsa_next_free_sgpr 12
+; ASM-SDAG: ; TotalNumSgprs: 18
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 18
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM-GISEL: .amdhsa_user_sgpr_count 8
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
; field that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -23,47 +37,80 @@
define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
-; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
-; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
+; OBJDUMP-SDAG-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
+; OBJDUMP-SDAG-NEXT: 0050 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0060 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
-; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; OBJDUMP-GISEL-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
+; OBJDUMP-GISEL-NEXT: 0050 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0060 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0070 0000af00 84000000 08000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 10
+; ASM-SDAG: .amdhsa_next_free_sgpr 10
+; ASM-SDAG: ; TotalNumSgprs: 16
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 16
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
; implicit, and 6 extra.
define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
-; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................
-; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+; OBJDUMP-SDAG-NEXT: 0080 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 0090 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+
+; OBJDUMP-GISEL-NEXT: 0080 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 0090 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00b0 0000af00 84000000 08000000 00000000 ................
-; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 3
+; ASM-SDAG: .amdhsa_next_free_sgpr 3
+; ASM-SDAG: ; TotalNumSgprs: 9
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 9
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
-; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
-; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
-
-; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
-; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; OBJDUMP-SDAG-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-SDAG-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
+
+; OBJDUMP-GISEL-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-GISEL-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
+
+; ASM-SDAG-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-SDAG: .amdhsa_user_sgpr_count 2
+; ASM-SDAG: .amdhsa_next_free_sgpr 0
+; ASM-SDAG: ; TotalNumSgprs: 6
+; ASM-SDAG: ; NumSGPRsForWavesPerEU: 6
+
+; ASM-GISEL-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM-GISEL: .amdhsa_user_sgpr_count 2
+; ASM-GISEL: .amdhsa_next_free_sgpr 0
+; ASM-GISEL: ; TotalNumSgprs: 6
+; ASM-GISEL: ; NumSGPRsForWavesPerEU: 6
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
; Encoded like '00'.
More information about the llvm-commits
mailing list