[llvm] [AMDGPU] upstream barrier count reporting part1 (PR #154409)
Gang Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 19 15:56:21 PDT 2025
https://github.com/cmc-rep updated https://github.com/llvm/llvm-project/pull/154409
>From 0661368f13eadd914bd7085be90e3e3a8ef5d575 Mon Sep 17 00:00:00 2001
From: Gang Chen <Gang.Chen at amd.com>
Date: Mon, 18 Aug 2025 15:33:34 -0700
Subject: [PATCH 1/3] [AMDGPU] upstream barrier count reporting part1
---
.../llvm/Support/AMDHSAKernelDescriptor.h | 10 ++++++++-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 19 ++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++++++
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 15 +++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 6 +++++
.../Target/AMDGPU/AMDGPUMachineFunction.cpp | 2 ++
.../lib/Target/AMDGPU/AMDGPUMachineFunction.h | 8 +++++++
.../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 2 ++
.../AMDGPU/AMDGPUResourceUsageAnalysis.h | 1 +
.../Disassembler/AMDGPUDisassembler.cpp | 14 ++++++++++--
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 9 ++++----
.../MCTargetDesc/AMDGPUTargetStreamer.h | 22 +++++++++----------
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 1 +
llvm/lib/Target/AMDGPU/SIProgramInfo.h | 3 +++
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 5 +++++
15 files changed, 105 insertions(+), 19 deletions(-)
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 8f367390c531c..4fb6fa656b6e7 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -186,6 +186,9 @@ enum : int32_t {
// [GFX10-GFX11].
#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
+// [GFX10-GFX120].
+#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_ ## NAME, SHIFT, WIDTH)
// GFX11+.
#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
@@ -195,6 +198,9 @@ enum : int32_t {
// GFX12+.
#define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
+// [GFX125].
+#define COMPUTE_PGM_RSRC3_GFX125(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX125_##NAME, SHIFT, WIDTH)
enum : int32_t {
COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
@@ -206,7 +212,9 @@ enum : int32_t {
COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
- COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
+ COMPUTE_PGM_RSRC3_GFX10_GFX120(RESERVED4, 14, 3),
+ COMPUTE_PGM_RSRC3_GFX125(NAMED_BAR_CNT, 14, 3),
+ COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED5, 17, 14),
COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 66c3fad6de1a1..69722bdf0a49b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -720,6 +720,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
IsLocal),
+ RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
+ OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
@@ -807,6 +809,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
}
+ if (AMDGPU::isGFX1250(STM)) {
+ const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
+ const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
+ CurrentProgramInfo.NamedBarCnt, BarBlkConst, Ctx);
+ const MCExpr *BarBlks =
+ MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
+ OutStreamer->emitRawComment(" NamedBarCnt: " + getMCExprStr(BarBlks),
+ false);
+ }
+
OutStreamer->emitRawComment(
" Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
@@ -1011,6 +1023,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DynamicCallStack =
MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
+ ProgInfo.NamedBarCnt = GetSymRefExpr(RIK::RIK_NumNamedBarrier);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1253,6 +1266,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
+ if (AMDGPU::isGFX1250(STM))
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
+
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..6c0192d68690e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1512,9 +1512,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
if (!MFI->isModuleEntryFunction()) {
+ auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
if (std::optional<uint32_t> Address =
AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+ if (IsNamedBarrier) {
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ MFI->recordNumNamedBarriers(Address.value(), BarCnt);
+ }
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+ } else if (IsNamedBarrier) {
+ llvm_unreachable("named barrier should have an assigned address");
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 6390853d53b56..6b3cdf57f3ad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -39,6 +39,8 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
return GOCS(".num_agpr");
case RIK_NumSGPR:
return GOCS(".numbered_sgpr");
+ case RIK_NumNamedBarrier:
+ return GOCS(".num_named_barrier");
case RIK_PrivateSegSize:
return GOCS(".private_seg_size");
case RIK_UsesVCC:
@@ -66,6 +68,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+ MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
auto assignMaxRegSym = [&OutContext](MCSymbol *Sym, int32_t RegCount) {
const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
@@ -75,6 +78,7 @@ void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
assignMaxRegSym(MaxVGPRSym, MaxVGPR);
assignMaxRegSym(MaxAGPRSym, MaxAGPR);
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+ assignMaxRegSym(MaxNamedBarrierSym, MaxNamedBarrier);
}
void MCResourceInfo::reset() { *this = MCResourceInfo(); }
@@ -97,6 +101,10 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
}
+MCSymbol *MCResourceInfo::getMaxNamedBarrierSymbol(MCContext &OutContext) {
+ return OutContext.getOrCreateSymbol("amdgpu.max_num_named_barrier");
+}
+
// Tries to flatten recursive call register resource gathering. Simple cycle
// avoiding dfs to find the constants in the propagated symbols.
// Assumes:
@@ -227,6 +235,10 @@ void MCResourceInfo::assignResourceInfoExpr(
case RIK_NumAGPR:
ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
break;
+ case RIK_NumNamedBarrier:
+ ArgExprs.push_back(MCSymbolRefExpr::create(
+ getMaxNamedBarrierSymbol(OutContext), OutContext));
+ break;
}
}
}
@@ -245,11 +257,13 @@ void MCResourceInfo::gatherResourceInfo(
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
bool IsLocal = MF.getFunction().hasLocalLinkage();
+ MCSymbol *MaxNamedBarrierSym = getMaxNamedBarrierSymbol(OutContext);
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
addMaxVGPRCandidate(FRI.NumVGPR);
addMaxAGPRCandidate(FRI.NumAGPR);
addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+ addMaxNamedBarrierCandidate(FRI.NumNamedBarrier);
}
const TargetMachine &TM = MF.getTarget();
@@ -288,6 +302,7 @@ void MCResourceInfo::gatherResourceInfo(
SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+ SetMaxReg(MaxNamedBarrierSym, FRI.NumNamedBarrier, RIK_NumNamedBarrier);
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 297e93bb54e29..b6055166698b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -31,6 +31,7 @@ class MCResourceInfo {
RIK_NumVGPR,
RIK_NumAGPR,
RIK_NumSGPR,
+ RIK_NumNamedBarrier,
RIK_PrivateSegSize,
RIK_UsesVCC,
RIK_UsesFlatScratch,
@@ -43,6 +44,7 @@ class MCResourceInfo {
int32_t MaxVGPR = 0;
int32_t MaxAGPR = 0;
int32_t MaxSGPR = 0;
+ int32_t MaxNamedBarrier = 0;
// Whether the MCResourceInfo has been finalized through finalize(MCContext
// &). Should only be called once, at the end of AsmPrinting to assign MaxXGPR
@@ -75,6 +77,9 @@ class MCResourceInfo {
void addMaxSGPRCandidate(int32_t candidate) {
MaxSGPR = std::max(MaxSGPR, candidate);
}
+ void addMaxNamedBarrierCandidate(int32_t candidate) {
+ MaxNamedBarrier = std::max(MaxNamedBarrier, candidate);
+ }
MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
MCContext &OutContext, bool IsLocal);
@@ -90,6 +95,7 @@ class MCResourceInfo {
MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
+ MCSymbol *getMaxNamedBarrierSymbol(MCContext &OutContext);
/// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
/// granularity. However, some resource info has to be assigned the call
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c82caced3c9b..664a15ca55f53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -107,6 +107,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
if (!BarAddr)
llvm_unreachable("named barrier should have an assigned address");
Entry.first->second = BarAddr.value();
+ unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16;
+ recordNumNamedBarriers(BarAddr.value(), BarCnt);
return BarAddr.value();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34..fc64e16ffbeb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -49,6 +49,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
// Flag to check dynamic LDS usage by kernel.
bool UsesDynamicLDS = false;
+ uint32_t NumNamedBarriers = 0;
+
// Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
@@ -86,6 +88,12 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return GDSSize;
}
+ void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
+ NumNamedBarriers =
+ std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+ }
+ uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
+
bool isEntryFunction() const {
return IsEntryFunction;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ccd2de18979d1..0ea9add891111 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -142,6 +142,8 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
MRI.isLiveIn(MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+ Info.NumNamedBarrier = MFI->getNumNamedBarriers();
+
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
// instructions aren't used to access the scratch buffer. Inline assembly may
// need it though.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index acfff960d3f58..9ae3bb3ca048e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -35,6 +35,7 @@ struct AMDGPUResourceUsageAnalysisImpl {
int32_t NumVGPR = 0;
int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
+ int32_t NumNamedBarrier = 0;
uint64_t CalleeSegmentSize = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fb7d634e62272..070de008d4f59 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2422,8 +2422,18 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
"must be zero on gfx10 or gfx11");
}
- // Bits [14-30].
- CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4,
+ // Bits [14-16].
+ if (isGFX1250()) {
+ PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
+ COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
+ } else {
+ CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_GFX120_RESERVED4,
+ "COMPUTE_PGM_RSRC3",
+ "must be zero on gfx10+");
+ }
+
+ // Bits [17-30].
+ CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED5,
"COMPUTE_PGM_RSRC3", "must be zero on gfx10+");
// Bits [31].
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e20581d76fcde..197de1228a29e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -276,10 +276,10 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) {
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
@@ -290,6 +290,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
PRINT_RES_INFO(NumVGPR);
PRINT_RES_INFO(NumAGPR);
PRINT_RES_INFO(NumExplicitSGPR);
+ PRINT_RES_INFO(NumNamedBarrier);
PRINT_RES_INFO(PrivateSegmentSize);
PRINT_RES_INFO(UsesVCC);
PRINT_RES_INFO(UsesFlatScratch);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9c49020850584..22afcdebcdf09 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -62,10 +62,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
virtual void EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) {};
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) {};
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
@@ -141,14 +141,12 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
- void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
- const MCSymbol *NumExplicitSGPR,
- const MCSymbol *PrivateSegmentSize,
- const MCSymbol *UsesVCC,
- const MCSymbol *UsesFlatScratch,
- const MCSymbol *HasDynamicallySizedStack,
- const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall) override;
+ void EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier,
+ const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override;
void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
const MCSymbol *MaxSGPR) override;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 5940f45e74bf2..93ba0a337d7dd 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -73,6 +73,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
NumSGPRsForWavesPerEU = ZeroExpr;
NumVGPRsForWavesPerEU = ZeroExpr;
+ NamedBarCnt = ZeroExpr;
Occupancy = ZeroExpr;
DynamicCallStack = ZeroExpr;
VCCUsed = ZeroExpr;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 79099d2182cc8..171c4a313a53b 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -83,6 +83,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
const MCExpr *NumVGPRsForWavesPerEU = nullptr;
+ // Number of named barriers used by the kernel.
+ const MCExpr *NamedBarCnt = nullptr;
+
// Final occupancy.
const MCExpr *Occupancy = nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 7cf8883082458..07189ee46e3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s
@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@@ -9,6 +10,7 @@
; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
+; SOUT: .set func1.num_named_barrier, 3
define void @func1() {
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
@@ -16,6 +18,7 @@ define void @func1() {
ret void
}
+; SOUT: .set func2.num_named_barrier, 1
define void @func2() {
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
@@ -23,6 +26,7 @@ define void @func2() {
ret void
}
+; SOUT: .set kernel1.num_named_barrier, max(2, func1.num_named_barrier, func2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
@@ -36,6 +40,7 @@ define amdgpu_kernel void @kernel1() #0 {
ret void
}
+; SOUT: .set kernel2.num_named_barrier, max(2, func2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
>From 14e03a433f1f1979242643aa2b866435e5e26a80 Mon Sep 17 00:00:00 2001
From: Gang Chen <Gang.Chen at amd.com>
Date: Tue, 19 Aug 2025 12:23:28 -0700
Subject: [PATCH 2/3] [AMDGPU] fix format
---
llvm/include/llvm/Support/AMDHSAKernelDescriptor.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 4fb6fa656b6e7..78f38ed5a9d4b 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -187,11 +187,11 @@ enum : int32_t {
#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
// [GFX10-GFX120].
-#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
- AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_ ## NAME, SHIFT, WIDTH)
+#define COMPUTE_PGM_RSRC3_GFX10_GFX120(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX120_##NAME, SHIFT, WIDTH)
// GFX11+.
-#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
- AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
+#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_##NAME, SHIFT, WIDTH)
// [GFX11].
#define COMPUTE_PGM_RSRC3_GFX11(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_##NAME, SHIFT, WIDTH)
>From 46ab50b915abc56ab73d2119f66c7762ada33077 Mon Sep 17 00:00:00 2001
From: Gang Chen <Gang.Chen at amd.com>
Date: Tue, 19 Aug 2025 15:46:13 -0700
Subject: [PATCH 3/3] [AMDGPU] fix test for s-barrier
---
llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll | 1 -
.../MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test | 2 +-
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 07189ee46e3a9..5295a13461f69 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -32,7 +32,6 @@ define amdgpu_kernel void @kernel1() #0 {
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test
index 1a54bac5fa40f..ad7bba076002f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test
+++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-rsrc-errors.test
@@ -39,7 +39,7 @@
# RUN: yaml2obj %s -DGPU=GFX1100 -DSRC1=0300AC60 -DSRC2=80000000 -DSRC3=00000100 \
# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RSRC3_10
-# RSRC3_10: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 reserved bits in range (30:14) set, must be zero on gfx10+
+# RSRC3_10: ; error decoding test.kd: kernel descriptor COMPUTE_PGM_RSRC3 reserved bits in range (16:14) set, must be zero on gfx10+
# RSRC3_10-NEXT: ; decoding failed region as bytes
# RUN: yaml2obj %s -DGPU=GFX801 -DSRC1=0300AC60 -DSRC2=80000000 -DSRC3=00000001 \
More information about the llvm-commits mailing list