[llvm] 6c9a9d9 - [AMDGPU] Set inst_pref_size to maximum (#126981)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 3 10:40:35 PST 2025
Author: Stanislav Mekhanoshin
Date: 2025-03-03T10:40:31-08:00
New Revision: 6c9a9d9fe2371e586be8ecba8b9a2d129d1c0226
URL: https://github.com/llvm/llvm-project/commit/6c9a9d9fe2371e586be8ecba8b9a2d129d1c0226
DIFF: https://github.com/llvm/llvm-project/commit/6c9a9d9fe2371e586be8ecba8b9a2d129d1c0226.diff
LOG: [AMDGPU] Set inst_pref_size to maximum (#126981)
On gfx11 and gfx12, set the initial instruction prefetch size to the
minimum of the kernel size and the maximum allowed value.
Fixes: SWDEV-513122
Added:
llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
llvm/lib/Target/AMDGPU/SIProgramInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b5e9d3759d608..31e0bd8d652bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1230,18 +1230,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;
- if (STM.hasGFX90AInsts()) {
- // return ((Dst & ~Mask) | (Value << Shift))
- auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
- uint32_t Shift) {
- const auto *Shft = MCConstantExpr::create(Shift, Ctx);
- const auto *Msk = MCConstantExpr::create(Mask, Ctx);
- Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
- Dst = MCBinaryExpr::createOr(
- Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
- return Dst;
- };
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+ const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+ Ctx);
+ return Dst;
+ };
+ if (STM.hasGFX90AInsts()) {
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
@@ -1268,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
+
+ if (isGFX11Plus(STM)) {
+ uint32_t CodeSizeInBytes = (uint32_t)std::min(
+ ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
+ (uint64_t)std::numeric_limits<uint32_t>::max());
+ uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+ uint32_t Field, Shift, Width;
+ if (isGFX11(STM)) {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+ } else {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+ }
+ uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+ ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+ CreateExpr(InstPrefSize), Field, Shift);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 82e84b5fc1640..ef72690b91662 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -202,8 +202,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}
-uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
- if (CodeSizeInBytes.has_value())
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
+ bool IsLowerBound) {
+ if (!IsLowerBound && CodeSizeInBytes.has_value())
return *CodeSizeInBytes;
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -216,7 +217,8 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
// overestimated. In case of inline asm used getInstSizeInBytes() will
// return a maximum size of a single instruction, where the real size may
// differ. At this point CodeSize may be already off.
- CodeSize = alignTo(CodeSize, MBB.getAlignment());
+ if (!IsLowerBound)
+ CodeSize = alignTo(CodeSize, MBB.getAlignment());
for (const MachineInstr &MI : MBB) {
// TODO: CodeSize should account for multiple functions.
@@ -224,6 +226,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
if (MI.isMetaInstruction())
continue;
+ // We cannot properly estimate inline asm size. It can be as small as zero
+ // if that is just a comment.
+ if (IsLowerBound && MI.isInlineAsm())
+ continue;
+
CodeSize += TII->getInstSizeInBytes(MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 3f68b0255a375..35c8d58f3c476 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -101,7 +101,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
void reset(const MachineFunction &MF);
// Get function code size and cache the value.
- uint64_t getFunctionCodeSize(const MachineFunction &MF);
+ // If \p IsLowerBound is set it returns a minimal code size which is safe
+ // to address.
+ uint64_t getFunctionCodeSize(const MachineFunction &MF,
+ bool IsLowerBound = false);
/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
new file mode 100644
index 0000000000000..580167076e1f0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
+
+; GCN-LABEL: .amdhsa_kernel large
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
+define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+bb:
+ call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
+ ret void
+}
+
+; GCN-LABEL: .amdhsa_kernel small
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @small() {
+bb:
+ ret void
+}
+
+; Ignore inline asm in size calculation
+
+; GCN-LABEL: .amdhsa_kernel inline_asm
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @inline_asm() {
+bb:
+ call void asm sideeffect ".fill 256, 4, 0", ""()
+ ret void
+}
More information about the llvm-commits
mailing list