[llvm-branch-commits] [llvm] 896e337 - Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation (#…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 11 17:05:45 PDT 2026
Author: theRonShark
Date: 2026-05-11T20:05:41-04:00
New Revision: 896e337bfc2b4029ca46cee456769eff7bc40ef0
URL: https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0
DIFF: https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0.diff
LOG: Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation (#…"
This reverts commit 7ddee0b619f658cef905a69427ef9531fd1d229d.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
llvm/lib/Target/AMDGPU/SIProgramInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
Removed:
llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 3a2738d9fc498..ad61d8d084c7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -234,18 +234,6 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
-/// Set bits in a kernel descriptor MCExpr field:
-/// return ((Dst & ~Mask) | (Value << Shift))
-static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
- uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
- const auto *Shft = MCConstantExpr::create(Shift, Ctx);
- const auto *Msk = MCConstantExpr::create(Mask, Ctx);
- Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
- Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
- Ctx);
- return Dst;
-}
-
void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
@@ -253,29 +241,6 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) {
assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
- const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
- MCContext &Ctx = MF->getContext();
-
- AMDGPU::MCKernelDescriptor KD =
- getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo);
-
- // Compute inst_pref_size using MCExpr label subtraction for exact code
- // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
- // right after the function code, so (Lfunc_end - func_sym) gives the
- // exact function code size in bytes.
- if (STM.hasInstPrefSize()) {
- const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
- MCSymbolRefExpr::create(getFunctionEnd(), OutContext),
- MCSymbolRefExpr::create(CurrentFnSym, OutContext), OutContext);
-
- uint32_t Mask, Shift, Width, CacheLineSize;
- STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
- const MCExpr *InstPrefSize =
- AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx);
- KD.compute_pgm_rsrc3 =
- setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx);
- }
-
auto &Streamer = getTargetStreamer()->getStreamer();
auto &Context = Streamer.getContext();
auto &ObjectFileInfo = *Context.getObjectFileInfo();
@@ -289,10 +254,13 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) {
Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
ReadOnlySection.ensureMinAlignment(Align(64));
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
- STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU,
+ STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ CurrentProgramInfo.NumVGPRsForWavesPerEU,
MCBinaryExpr::createSub(
CurrentProgramInfo.NumSGPRsForWavesPerEU,
AMDGPUMCExpr::createExtraSGPRs(
@@ -1470,22 +1438,33 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+ const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+ Ctx);
+ return Dst;
+ };
+
if (STM.hasGFX90AInsts()) {
ProgInfo.ComputePGMRSrc3 =
- setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
ProgInfo.ComputePGMRSrc3 =
- setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
+ SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
if (STM.hasGFX1250Insts())
ProgInfo.ComputePGMRSrc3 =
- setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
- amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
+ amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
ProgInfo.Occupancy = createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
@@ -1504,6 +1483,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
+
+ if (isGFX11Plus(STM)) {
+ uint32_t CodeSizeInBytes = (uint32_t)std::min(
+ ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
+ (uint64_t)std::numeric_limits<uint32_t>::max());
+ uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+ uint32_t Field, Shift, Width;
+ if (isGFX11(STM)) {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+ } else {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+ }
+ uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+ ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+ CreateExpr(InstPrefSize), Field, Shift);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 758e9b445d6dd..5f580ac0577d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -21,7 +21,6 @@
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/ErrorHandling.h"
#define GET_SUBTARGETINFO_HEADER
@@ -426,23 +425,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasPrefetch() const { return HasGFX12Insts; }
- bool hasInstPrefSize() const { return isGFX11Plus(); }
-
- void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
- uint32_t &CacheLineSize) const {
- assert(isGFX11Plus());
- CacheLineSize = getInstCacheLineSize();
- if (getGeneration() == GFX11) {
- Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
- Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
- Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
- } else {
- Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
- Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
- Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
- }
- }
-
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4563803ad6577..fd0a2a6a77d7e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -12,12 +12,9 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>
#include <optional>
@@ -77,9 +74,6 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
case AGVK_Occupancy:
OS << "occupancy(";
break;
- case AGVK_InstPrefSize:
- OS << "instprefsize(";
- break;
case AGVK_Lit:
OS << "lit(";
break;
@@ -188,27 +182,6 @@ bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res,
return true;
}
-/// Get the inst_pref_size field width for the given subtarget.
-static unsigned getInstPrefSizeFieldWidth(const MCSubtargetInfo &STI) {
- if (AMDGPU::isGFX12Plus(STI))
- return amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
- return amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
-}
-
-bool AMDGPUMCExpr::evaluateInstPrefSize(MCValue &Res,
- const MCAssembler *Asm) const {
- uint64_t CodeSizeInBytes = 0;
- if (!evaluateMCExprs(Args, Asm, {CodeSizeInBytes}))
- return false;
- const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
- unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI);
- unsigned CacheLineSize = AMDGPU::IsaInfo::getInstCacheLineSize(STI);
- uint64_t CodeSizeInLines = divideCeil(CodeSizeInBytes, CacheLineSize);
- uint64_t MaxVal = (1u << FieldWidth) - 1;
- Res = MCValue::get(std::min(CodeSizeInLines, MaxVal));
- return true;
-}
-
bool AMDGPUMCExpr::isSymbolUsedInExpression(const MCSymbol *Sym,
const MCExpr *E) {
switch (E->getKind()) {
@@ -254,8 +227,6 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
return evaluateTotalNumVGPR(Res, Asm);
case AGVK_Occupancy:
return evaluateOccupancy(Res, Asm);
- case AGVK_InstPrefSize:
- return evaluateInstPrefSize(Res, Asm);
case AGVK_Lit:
case AGVK_Lit64:
return Args[0]->evaluateAsRelocatable(Res, Asm);
@@ -308,11 +279,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
-const AMDGPUMCExpr *
-AMDGPUMCExpr::createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx) {
- return create(AGVK_InstPrefSize, {CodeSizeBytes}, Ctx);
-}
-
const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx) {
assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64);
@@ -503,7 +469,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs:
case AMDGPUMCExpr::VariantKind::AGVK_AlignTo:
case AMDGPUMCExpr::VariantKind::AGVK_Occupancy:
- case AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize:
case AMDGPUMCExpr::VariantKind::AGVK_Lit:
case AMDGPUMCExpr::VariantKind::AGVK_Lit64: {
int64_t Val;
@@ -512,16 +477,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM,
KBM[Expr] = KnownBits::makeConstant(APValue);
return;
}
- if (AGVK->getKind() == AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize) {
- // The result is clamped to (1 << FieldWidth) - 1, so upper bits are
- // known zero. FieldWidth is derived from the subtarget.
- const MCSubtargetInfo *STI = AGVK->getCtx().getSubtargetInfo();
- unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI);
- KnownBits KB(BitWidth);
- KB.Zero.setBitsFrom(FieldWidth);
- KBM[Expr] = KB;
- return;
- }
KBM[Expr] = KnownBits(BitWidth);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 4b1aa0c591a80..96bd8f4cf3c13 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -38,7 +38,6 @@ class AMDGPUMCExpr : public MCTargetExpr {
AGVK_TotalNumVGPRs,
AGVK_AlignTo,
AGVK_Occupancy,
- AGVK_InstPrefSize,
AGVK_Lit,
AGVK_Lit64,
};
@@ -70,7 +69,6 @@ class AMDGPUMCExpr : public MCTargetExpr {
bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
bool evaluateAlignTo(MCValue &Res, const MCAssembler *Asm) const;
bool evaluateOccupancy(MCValue &Res, const MCAssembler *Asm) const;
- bool evaluateInstPrefSize(MCValue &Res, const MCAssembler *Asm) const;
public:
static const AMDGPUMCExpr *
@@ -99,18 +97,11 @@ class AMDGPUMCExpr : public MCTargetExpr {
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
- /// Create an expression for instruction prefetch size computation:
- /// min(divideCeil(CodeSizeBytes, CacheLineSize), (1 << FieldWidth) - 1)
- /// FieldWidth and CacheLineSize are derived from the subtarget.
- static const AMDGPUMCExpr *createInstPrefSize(const MCExpr *CodeSizeBytes,
- MCContext &Ctx);
-
static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value,
MCContext &Ctx);
ArrayRef<const MCExpr *> getArgs() const { return Args; }
VariantKind getKind() const { return Kind; }
- MCContext &getCtx() const { return Ctx; }
const MCExpr *getSubExpr(size_t Index) const;
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 27cef7a1b9158..99255e4060886 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -215,8 +215,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}
-uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
- if (CodeSizeInBytes.has_value())
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
+ bool IsLowerBound) {
+ if (!IsLowerBound && CodeSizeInBytes.has_value())
return *CodeSizeInBytes;
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -225,7 +226,12 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
uint64_t CodeSize = 0;
for (const MachineBasicBlock &MBB : MF) {
- CodeSize = alignTo(CodeSize, MBB.getAlignment());
+ // The amount of padding to align code can be both underestimated and
+ // overestimated. In case of inline asm used getInstSizeInBytes() will
+ // return a maximum size of a single instruction, where the real size may
+ // differ. At this point CodeSize may be already off.
+ if (!IsLowerBound)
+ CodeSize = alignTo(CodeSize, MBB.getAlignment());
for (const MachineInstr &MI : MBB) {
// TODO: CodeSize should account for multiple functions.
@@ -233,6 +239,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
if (MI.isMetaInstruction())
continue;
+ // We cannot properly estimate inline asm size. It can be as small as zero
+ // if that is just a comment.
+ if (IsLowerBound && MI.isInlineAsm())
+ continue;
+
CodeSize += TII->getInstSizeInBytes(MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index fb56ebf88c96f..947b473142a1f 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -105,7 +105,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
void reset(const MachineFunction &MF);
// Get function code size and cache the value.
- uint64_t getFunctionCodeSize(const MachineFunction &MF);
+ // If \p IsLowerBound is set it returns a minimal code size which is safe
+ // to address.
+ uint64_t getFunctionCodeSize(const MachineFunction &MF,
+ bool IsLowerBound = false);
/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index dd67e77d0d9ed..b13aed2432602 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1182,14 +1182,6 @@ std::string AMDGPUTargetID::toString() const {
return Str;
}
-unsigned getInstCacheLineSize(const MCSubtargetInfo *STI) {
- if (STI->getFeatureBits().test(FeatureInstCacheLineSize128))
- return 128;
- if (STI->getFeatureBits().test(FeatureInstCacheLineSize64))
- return 64;
- return 64;
-}
-
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureWavefrontSize16))
return 16;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index e1b36f0996331..49373f09ee460 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -233,9 +233,6 @@ inline raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-/// \returns Instruction cache line size in bytes for given subtarget \p STI.
-unsigned getInstCacheLineSize(const MCSubtargetInfo *STI);
-
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
index b76ef7eac11c4..580167076e1f0 100644
--- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -1,31 +1,11 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
-;; Verify that inst_pref_size resolves to the correct value in the object file.
-;; COMPUTE_PGM_RSRC3 is at offset 0x2C in each 64-byte kernel descriptor.
-;; inst_pref_size is bits [9:4] on GFX11 (6-bit) and bits [11:4] on GFX12+ (8-bit).
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx11.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx12.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s
-
-; The inst_pref_size is computed via MCExpr label subtraction, resolved at
-; assembly/link time. In text output it appears as:
-; ((instprefsize(<code_size>)<<Shift)&Mask)>>Shift
-; where:
-; <code_size> = .Lfunc_endN - func_sym (exact function code size in bytes)
-; instprefsize = min(divideCeil(code_size, cache_line_size), (1 << field_width) - 1)
-; field_width and cache_line_size are derived from the subtarget
-
; GCN-LABEL: .amdhsa_kernel large
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&1008)>>4
-; GFX11: codeLenInByte = {{[0-9]+}}
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&4080)>>4
-; GFX12: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C:
-;; gfx11 pref=3 (0x30), gfx12 pref=4 (0x40)
-; OBJ-GFX11: 0020 {{.*}}30000000
-; OBJ-GFX12: 0020 {{.*}}40000000
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
bb:
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
@@ -33,30 +13,18 @@ bb:
}
; GCN-LABEL: .amdhsa_kernel small
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&1008)>>4
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&4080)>>4
-; GCN: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C:
-;; pref=1 (0x10) for both
-; OBJ-GFX11: 0060 {{.*}}10000000
-; OBJ-GFX12: 0060 {{.*}}10000000
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
define amdgpu_kernel void @small() {
bb:
ret void
}
-; Inline asm is accounted for via MCExpr label subtraction (exact code size).
-; The MCExpr resolves to the correct inst_pref_size at assembly time.
+; Ignore inline asm in size calculation
; GCN-LABEL: .amdhsa_kernel inline_asm
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&1008)>>4
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&4080)>>4
-; GCN: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC:
-;; pref=9 (0x90) for both
-;; (.fill 256, 4, 0 = 1024 bytes + 4 s_endpgm = 1028 -> divideCeil(1028,128) = 9)
-; OBJ-GFX11: 00a0 {{.*}}90000000
-; OBJ-GFX12: 00a0 {{.*}}90000000
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
define amdgpu_kernel void @inline_asm() {
bb:
call void asm sideeffect ".fill 256, 4, 0", ""()
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll
deleted file mode 100644
index 287a30032230b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll
+++ /dev/null
@@ -1,154 +0,0 @@
-;; Verify that inline assembly is correctly accounted for in the
-;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr
-;; label subtraction (.Lfunc_end - func_sym), giving exact code size.
-;; See inst-prefetch-hint.ll for explanation of the instprefsize expression.
-
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o %t.gfx11.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o %t.gfx12.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s
-
-;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 ---
-;; pref_size = divideCeil(1028, 128) = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_fill
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_fill
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0020 {{.*}}90000000
-; OBJ-GFX12: 0020 {{.*}}90000000
-
-define amdgpu_kernel void @test_fill() {
- call void asm sideeffect ".fill 256, 4, 0", ""()
- ret void
-}
-
-;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 ---
-;; pref_size = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_space
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_space
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0060 {{.*}}90000000
-; OBJ-GFX12: 0060 {{.*}}90000000
-
-define amdgpu_kernel void @test_space() {
- call void asm sideeffect ".space 1024", ""()
- ret void
-}
-
-;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 ---
-;; pref_size = divideCeil(132, 128) = 2
-
-; GFX11-LABEL: .amdhsa_kernel test_instructions
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_instructions
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC:
-;; pref_size=2 -> 2<<4 = 0x20
-; OBJ-GFX11: 00a0 {{.*}}20000000
-; OBJ-GFX12: 00a0 {{.*}}20000000
-
-define amdgpu_kernel void @test_instructions() {
- call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0", ""()
- ret void
-}
-
-;; --- Comments emit no bytes: only s_endpgm = 4 bytes ---
-;; pref_size = 1
-
-; GFX11-LABEL: .amdhsa_kernel test_comments
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_comments
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4
-;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC:
-;; pref_size=1 -> 1<<4 = 0x10
-; OBJ-GFX11: 00e0 {{.*}}10000000
-; OBJ-GFX12: 00e0 {{.*}}10000000
-
-define amdgpu_kernel void @test_comments() {
- call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""()
- ret void
-}
-
-;; --- Empty inline asm: only s_endpgm = 4 bytes ---
-;; pref_size = 1
-
-; GFX11-LABEL: .amdhsa_kernel test_empty_asm
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_empty_asm
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C:
-;; pref_size=1 -> 1<<4 = 0x10
-; OBJ-GFX11: 0120 {{.*}}10000000
-; OBJ-GFX12: 0120 {{.*}}10000000
-
-define amdgpu_kernel void @test_empty_asm() {
- call void asm sideeffect "", ""()
- ret void
-}
-
-;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 1028 ---
-;; pref_size = divideCeil(1028, 128) = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_multiple_asm
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_multiple_asm
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0160 {{.*}}90000000
-; OBJ-GFX12: 0160 {{.*}}90000000
-
-define amdgpu_kernel void @test_multiple_asm() {
- call void asm sideeffect ".fill 128, 4, 0", ""()
- call void asm sideeffect ".space 512", ""()
- ret void
-}
-
-;; --- Large function that exceeds GFX11 6-bit field max (63) ---
-;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes
-;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63
-;; pref_size should clamp to 63
-
-; GFX11-LABEL: .amdhsa_kernel test_clamping
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_clamping
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC:
-;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
-;; gfx12: no clamping, 65 -> 65<<4 = 0x410
-; OBJ-GFX11: 01a0 {{.*}}f0030000
-; OBJ-GFX12: 01a0 {{.*}}10040000
-
-define amdgpu_kernel void @test_clamping() {
- call void asm sideeffect ".fill 2048, 4, 0", ""()
- ret void
-}
-
-;; --- Large function that exceeds both GFX11 and GFX12 field max ---
-;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes
-;; divideCeil(32772, 128) = 257
-;; GFX11 max = 63, GFX12 max = 255 -> both clamp
-
-; GFX11-LABEL: .amdhsa_kernel test_clamping_both
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_clamping_both
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC:
-;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
-;; gfx12: clamped to 255 -> 255<<4 = 0xFF0
-; OBJ-GFX11: 01e0 {{.*}}f0030000
-; OBJ-GFX12: 01e0 {{.*}}f00f0000
-
-define amdgpu_kernel void @test_clamping_both() {
- call void asm sideeffect ".fill 8192, 4, 0", ""()
- ret void
-}
More information about the llvm-branch-commits
mailing list