[llvm] [AMDGPU] Set inst_pref_size to maximum (PR #126981)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 12 14:13:45 PST 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/126981
On gfx11 and gfx12 set initial instruction prefetch size to a
minimum of kernel size and maximum allowed value.
Fixes: SWDEV-513122
>From ddd77604df9e273fc963d7defad012f1fafca2e1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 12 Feb 2025 12:25:27 -0800
Subject: [PATCH] [AMDGPU] Set inst_pref_size to maximum
On gfx11 and gfx12 set initial instruction prefetch size to a
minimum of kernel size and maximum allowed value.
Fixes: SWDEV-513122
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 93 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 2 -
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 29 +++++-
llvm/lib/Target/AMDGPU/SIProgramInfo.h | 8 +-
.../test/CodeGen/AMDGPU/inst-prefetch-hint.ll | 21 +++++
5 files changed, 103 insertions(+), 50 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 031d8f0560ff2..d1d5b9a79ec5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
int64_t PGRM_Rsrc3 = 1;
bool EvaluatableRsrc3 =
- CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
+ CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
- assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
- KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -748,7 +749,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal)
->getVariableValue(),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
return false;
}
@@ -757,7 +758,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
CurrentProgramInfo.NumArchVGPR,
STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.ScratchSize,
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -821,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
[[maybe_unused]] int64_t PGMRSrc3;
- assert(STM.hasGFX90AInsts() ||
- (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
- PGMRSrc3) &&
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() ||
+ (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
@@ -893,27 +895,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
}
}
-uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = STM.getInstrInfo();
-
- uint64_t CodeSize = 0;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- // TODO: CodeSize should account for multiple functions.
-
- // TODO: Should we count size of debug info?
- if (MI.isDebugInstr())
- continue;
-
- CodeSize += TII->getInstSizeInBytes(MI);
- }
- }
-
- return CodeSize;
-}
-
// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
@@ -1249,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+ const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+ Ctx);
+ return Dst;
+ };
+
if (STM.hasGFX90AInsts()) {
- // return ((Dst & ~Mask) | (Value << Shift))
- auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
- uint32_t Shift) {
- const auto *Shft = MCConstantExpr::create(Shift, Ctx);
- const auto *Msk = MCConstantExpr::create(Mask, Ctx);
- Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
- Dst = MCBinaryExpr::createOr(
- Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
- return Dst;
- };
-
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
@@ -1287,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
+
+ if (isGFX11Plus(STM)) {
+ uint32_t CodeSizeInBytes =
+ (uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
+ (uint64_t)std::numeric_limits<uint32_t>::max());
+ uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+ uint32_t Field, Shift, Width;
+ if (isGFX11(STM)) {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+ } else {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+ }
+ uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+ ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+ CreateExpr(InstPrefSize), Field, Shift);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index cc8c4411805e2..2c959d7dbbd07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &KernelInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 212edff097837..85ea3a9e17e09 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+ CodeSizeInBytes.reset();
+
VGPRBlocks = ZeroExpr;
SGPRBlocks = ZeroExpr;
Priority = 0;
@@ -55,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
LdsSize = 0;
EXCPEnable = 0;
- ComputePGMRSrc3GFX90A = ZeroExpr;
+ ComputePGMRSrc3 = ZeroExpr;
NumVGPR = ZeroExpr;
NumArchVGPR = ZeroExpr;
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}
+
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
+ if (!CodeSizeInBytes.has_value()) {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+
+ uint64_t CodeSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: CodeSize should account for multiple functions.
+
+ // TODO: Should we count size of debug info?
+ if (MI.isDebugInstr())
+ continue;
+
+ CodeSize += TII->getInstSizeInBytes(MI);
+ }
+ }
+
+ CodeSizeInBytes = CodeSize;
+ }
+
+ return *CodeSizeInBytes;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index c358a2d9db10b..2836af033d4a4 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -19,6 +19,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Compiler.h"
#include <cstdint>
+#include <optional>
namespace llvm {
@@ -29,6 +30,8 @@ class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
+ std::optional<uint64_t> CodeSizeInBytes;
+
// Fields set in PGM_RSRC1 pm4 packet.
const MCExpr *VGPRBlocks = nullptr;
const MCExpr *SGPRBlocks = nullptr;
@@ -60,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;
- const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
+ const MCExpr *ComputePGMRSrc3 = nullptr;
const MCExpr *NumVGPR = nullptr;
const MCExpr *NumArchVGPR = nullptr;
@@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
// non-MCExpr members.
void reset(const MachineFunction &MF);
+ // Get function code size and cache the value.
+ uint64_t getFunctionCodeSize(const MachineFunction &MF);
+
/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const;
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
new file mode 100644
index 0000000000000..671352446ca31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
+
+; GCN-LABEL: .amdhsa_kernel large
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
+define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+bb:
+ call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
+ ret void
+}
+
+; GCN-LABEL: .amdhsa_kernel small
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @small() {
+bb:
+ ret void
+}
More information about the llvm-commits
mailing list