[llvm-branch-commits] [clang] 4401468 - Revert "[AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling (…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 14 00:46:03 PST 2026
Author: Pankaj Dwivedi
Date: 2026-01-14T14:15:59+05:30
New Revision: 44014689178470a5e327b26ff6255dc064eac39d
URL: https://github.com/llvm/llvm-project/commit/44014689178470a5e327b26ff6255dc064eac39d
DIFF: https://github.com/llvm/llvm-project/commit/44014689178470a5e327b26ff6255dc064eac39d.diff
LOG: Revert "[AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling (…"
This reverts commit 3dfb782333bf929945f63e5b0b1cad378b0bd87a.
Added:
Modified:
clang/include/clang/Basic/CodeGenOptions.def
clang/include/clang/Options/Options.td
clang/lib/CodeGen/Targets/AMDGPU.cpp
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Removed:
llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
################################################################################
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index baf8b093c10e6..6cdbffc456193 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -466,10 +466,6 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign)
/// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
-/// Enable expanded waitcnt for profiling (AMDGPU Only)
-/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls.
-CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
-
// Whether to emit Swift Async function extended frame information: auto,
// never, always.
ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 2f57a5b13b917..5ad0ff2a773c8 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5585,13 +5585,6 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee",
"This option changes the ABI. (AMDGPU only)">,
NegFlag<SetFalse, [], [ClangOption, CC1Option]>>;
-defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling",
- CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse,
- PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help "
- "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), "
- "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
- NegFlag<SetFalse, [], [ClangOption]>>;
-
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 4bc9557b26b52..0ab6c753b8bad 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -443,8 +443,6 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
setFunctionDeclAttributes(FD, F, M);
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
- if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
- F->addFnAttr("amdgpu-expand-waitcnt-profiling");
}
unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b3e834b66ad45..bf842e0ecb4af 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -105,35 +105,6 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
return enum_seq(LOAD_CNT, MaxCounter);
}
-// Get the maximum wait count value for a given counter type.
-static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
- InstCounterType T) {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- case VA_VDST:
- return Limits.VaVdstMax;
- case VM_VSRC:
- return Limits.VmVsrcMax;
- default:
- return 0;
- }
-}
-
/// Integer IDs used to track vector memory locations we may have to wait on.
/// Encoded as u16 chunks:
///
@@ -169,6 +140,19 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
return static_cast<unsigned>(RU);
}
+struct HardwareLimits {
+ unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
+ unsigned ExpcntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
+ unsigned VaVdstMax; // gfx12+ expert mode only.
+ unsigned VmVsrcMax; // gfx12+ expert mode only.
+};
+
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
@@ -330,27 +314,19 @@ class WaitcntGenerator {
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
bool OptNone;
- bool ExpandWaitcntProfiling = false;
- const AMDGPU::HardwareLimits *Limits = nullptr;
public:
WaitcntGenerator() = default;
- WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits)
+ WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
- ExpandWaitcntProfiling(
- MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
- Limits(Limits) {}
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
// Return true if the current function should be compiled with no
// optimization.
bool isOptNone() const { return OptNone; }
- const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
-
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
// any new wait count instructions which may need to be generated by
@@ -372,11 +348,9 @@ class WaitcntGenerator {
// Generates new wait count instructions according to the value of
// Wait, returning true if any new instructions were created.
- // If ScoreBrackets is provided, it can be used for profiling expansion.
virtual bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) = 0;
+ AMDGPU::Waitcnt Wait) = 0;
// Returns an array of bit masks which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
@@ -401,10 +375,7 @@ class WaitcntGenerator {
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
- WaitcntGeneratorPreGFX12() = default;
- WaitcntGeneratorPreGFX12(const MachineFunction &MF,
- const AMDGPU::HardwareLimits *Limits)
- : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
+ using WaitcntGenerator::WaitcntGenerator;
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -413,8 +384,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) override;
+ AMDGPU::Waitcnt Wait) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -446,10 +416,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
WaitcntGeneratorGFX12Plus() = default;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
- InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits,
- bool IsExpertMode)
- : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
+ InstCounterType MaxCounter, bool IsExpertMode)
+ : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -458,8 +426,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
bool createNewWaitcnt(MachineBasicBlock &Block,
MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait,
- WaitcntBrackets *ScoreBrackets = nullptr) override;
+ AMDGPU::Waitcnt Wait) override;
const unsigned *getWaitEventMask() const override {
assert(ST);
@@ -533,7 +500,7 @@ class SIInsertWaitcnts {
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- AMDGPU::HardwareLimits Limits;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -544,7 +511,33 @@ class SIInsertWaitcnts {
(void)ForceVMCounter;
}
- const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ case VA_VDST:
+ return Limits.VaVdstMax;
+ case VM_VSRC:
+ return Limits.VmVsrcMax;
+ default:
+ break;
+ }
+ return 0;
+ }
PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
const WaitcntBrackets &Brackets);
@@ -776,7 +769,7 @@ class WaitcntBrackets {
unsigned getPendingGDSWait() const {
return std::min(getScoreUB(DS_CNT) - LastGDS,
- getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
+ Context->getWaitCountMax(DS_CNT) - 1);
}
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -803,8 +796,8 @@ class WaitcntBrackets {
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
- getWaitCountMax(Context->getLimits(), STORE_CNT));
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
}
@@ -860,9 +853,8 @@ class WaitcntBrackets {
if (T != EXP_CNT)
return;
- if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
- ScoreLBs[EXP_CNT] =
- ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
+ if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
}
void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
@@ -1365,8 +1357,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- unsigned NeededWait = std::min(
- UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
+ unsigned NeededWait =
+ std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1683,109 +1675,38 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
/// required counters in \p Wait
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
+ AMDGPU::Waitcnt Wait) {
assert(ST);
assert(isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
- // Helper to emit expanded waitcnt sequence for profiling.
- // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
- // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
- auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
- auto EmitWaitcnt) {
- if (Outstanding > Target) {
- for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
- EmitWaitcnt(i);
- Modified = true;
- }
- } else {
- EmitWaitcnt(Target);
- Modified = true;
- }
- };
-
// Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
// single instruction while VScnt has its own instruction.
if (Wait.hasWaitExceptStoreCnt()) {
- // If profiling expansion is enabled and we have score brackets,
- // emit an expanded sequence
- if (ExpandWaitcntProfiling && ScoreBrackets) {
- // Check if any of the counters to be waited on are out-of-order.
- // If so, fall back to normal (non-expanded) behavior since expansion
- // would provide misleading profiling information.
- bool AnyOutOfOrder = false;
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
- unsigned &WaitCnt = getCounterRef(Wait, CT);
- if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
- AnyOutOfOrder = true;
- break;
- }
- }
-
- if (AnyOutOfOrder) {
- // Fall back to non-expanded wait
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
- } else {
- // All counters are in-order, safe to expand
- for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
- unsigned &WaitCnt = getCounterRef(Wait, CT);
- if (WaitCnt == ~0u)
- continue;
-
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
- ScoreBrackets->getScoreLB(CT),
- getWaitCountMax(getLimits(), CT) - 1);
- EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
- AMDGPU::Waitcnt W;
- getCounterRef(W, CT) = Count;
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(AMDGPU::encodeWaitcnt(IV, W));
- });
- }
- }
- } else {
- // Normal behavior: emit single combined waitcnt
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Modified = true;
+ Modified = true;
- LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
if (Wait.hasWaitStoreCnt()) {
assert(ST->hasVscnt());
- if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
- !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
- // Only expand if counter is not out-of-order
- unsigned Outstanding =
- std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
- ScoreBrackets->getScoreLB(STORE_CNT),
- getWaitCountMax(getLimits(), STORE_CNT) - 1);
- EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
+ [[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Count);
- });
- } else {
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.StoreCnt);
- Modified = true;
+ .addImm(Wait.StoreCnt);
+ Modified = true;
- LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
+ LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
return Modified;
@@ -2082,55 +2003,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
- AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
+ AMDGPU::Waitcnt Wait) {
assert(ST);
assert(!isNormalMode(MaxCounter));
bool Modified = false;
const DebugLoc &DL = Block.findDebugLoc(It);
- // Helper to emit expanded waitcnt sequence for profiling.
- auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
- auto EmitWaitcnt) {
- if (Outstanding > Target) {
- for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
- EmitWaitcnt(i);
- Modified = true;
- }
- } else {
- EmitWaitcnt(Target);
- Modified = true;
- }
- };
-
- // For GFX12+, we use separate wait instructions, which makes expansion
- // simpler
- if (ExpandWaitcntProfiling && ScoreBrackets) {
- for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- unsigned Count = getWait(Wait, CT);
- if (Count == ~0u)
- continue;
-
- // Skip expansion for out-of-order counters - emit normal wait instead
- if (ScoreBrackets->counterOutOfOrder(CT)) {
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(Count);
- Modified = true;
- continue;
- }
-
- unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
- ScoreBrackets->getScoreLB(CT),
- getWaitCountMax(getLimits(), CT) - 1);
- EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
- BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
- .addImm(Val);
- });
- }
- return Modified;
- }
-
- // Normal behavior (no expansion)
// Check for opportunities to use combined wait instructions.
if (Wait.DsCnt != ~0u) {
MachineInstr *SWaitInst = nullptr;
@@ -2529,7 +2408,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
Modified =
WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- AMDGPU::Waitcnt WaitForScore = Wait;
+ // Any counts that could have been applied to any existing waitcnt
+ // instructions will have been done so, now deal with any remaining.
+ ScoreBrackets.applyWaitcnt(Wait);
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -2546,13 +2427,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
- if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
+ if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
- // Any counts that could have been applied to any existing waitcnt
- // instructions will have been done so, now deal with any remaining.
- ScoreBrackets.applyWaitcnt(WaitForScore);
-
return Modified;
}
@@ -3259,9 +3136,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
- // Initialize hardware limits first, as they're needed by the generators.
- Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
-
if (ST->hasExtendedWaitCounts()) {
IsExpertMode = ST->hasExpertSchedulingMode() &&
(ExpertSchedulingModeFlag.getNumOccurrences()
@@ -3270,12 +3144,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
.getFnAttribute("amdgpu-expert-scheduling-mode")
.getValueAsBool());
MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus =
- WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode);
+ WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode);
WCG = &WCGGFX12Plus;
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
+ WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
WCG = &WCGPreGFX12;
}
@@ -3286,6 +3159,22 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
+ if (ST->hasExtendedWaitCounts()) {
+ Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
+ Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
+ } else {
+ Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
+ Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
+ }
+ Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+ Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
+ Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
+ Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
+ Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+ Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
+ Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask();
+ Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask();
+
BlockInfos.clear();
bool Modified = false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a6017f57714d4..ef384999851e9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1785,25 +1785,6 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
-HardwareLimits::HardwareLimits(const IsaVersion &IV,
- bool HasExtendedWaitCounts) {
- if (HasExtendedWaitCounts) {
- LoadcntMax = getLoadcntBitMask(IV);
- DscntMax = getDscntBitMask(IV);
- } else {
- LoadcntMax = getVmcntBitMask(IV);
- DscntMax = getLgkmcntBitMask(IV);
- }
- ExpcntMax = getExpcntBitMask(IV);
- StorecntMax = getStorecntBitMask(IV);
- SamplecntMax = getSamplecntBitMask(IV);
- BvhcntMax = getBvhcntBitMask(IV);
- KmcntMax = getKmcntBitMask(IV);
- XcntMax = getXcntBitMask(IV);
- VaVdstMax = DepCtr::getVaVdstBitMask();
- VmVsrcMax = DepCtr::getVmVsrcBitMask();
-}
-
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
getVmcntBitWidthLo(Version.Major));
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 770f9a86dc883..f6b95602644ca 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1131,26 +1131,6 @@ struct Waitcnt {
friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait);
};
-/// Represents the hardware counter limits for different wait count types.
-struct HardwareLimits {
- unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
- unsigned ExpcntMax;
- unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
- unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
- unsigned SamplecntMax; // gfx12+ only.
- unsigned BvhcntMax; // gfx12+ only.
- unsigned KmcntMax; // gfx12+ only.
- unsigned XcntMax; // gfx1250.
- unsigned VaVdstMax; // gfx12+ expert mode only.
- unsigned VmVsrcMax; // gfx12+ expert mode only.
-
- HardwareLimits() = default;
-
- /// Initializes hardware limits from ISA version.
- /// \p HasExtendedWaitCounts should be true for gfx12+.
- HardwareLimits(const IsaVersion &IV, bool HasExtendedWaitCounts);
-};
-
// The following methods are only meaningful on targets that support
// S_WAITCNT.
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
deleted file mode 100644
index 848a9d07084ed..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ /dev/null
@@ -1,944 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s
-; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s
-; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s
-; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s
-; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s
-; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s
-; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s
-; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s
-
-; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding
-; operations, instead of emitting a single waitcnt(target), we emit:
-; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target)
-;
-; This allows PC-sampling profilers to identify which specific operation
-; is causing a stall by observing where the program counter is stuck.
-
-define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 {
-; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX9-EXPAND: ; %bb.0:
-; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX9-NOEXPAND: ; %bb.0:
-; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX10-EXPAND: ; %bb.0:
-; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX10-NOEXPAND: ; %bb.0:
-; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0
-; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15]
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX11-EXPAND: ; %bb.0:
-; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX11-NOEXPAND: ; %bb.0:
-; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX12-EXPAND: ; %bb.0:
-; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
-; GFX12-NOEXPAND: ; %bb.0:
-; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-NOEXPAND-NEXT: s_endpgm
-
- %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
- %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
- %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
- %sum1 = add i32 %val_a, %val_b
- %sum2 = add i32 %sum1, %val_c
- store i32 %sum2, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 {
-; GFX9-EXPAND-LABEL: test_vmcnt_global_loads:
-; GFX9-EXPAND: ; %bb.0:
-; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
-; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
-; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2)
-; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1)
-; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads:
-; GFX9-NOEXPAND: ; %bb.0:
-; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
-; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
-; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_vmcnt_global_loads:
-; GFX10-EXPAND: ; %bb.0:
-; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: s_clause 0x2
-; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
-; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
-; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2)
-; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1)
-; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads:
-; GFX10-NOEXPAND: ; %bb.0:
-; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: s_clause 0x2
-; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256
-; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512
-; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_vmcnt_global_loads:
-; GFX11-EXPAND: ; %bb.0:
-; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_clause 0x2
-; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
-; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
-; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2)
-; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1)
-; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads:
-; GFX11-NOEXPAND: ; %bb.0:
-; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_clause 0x2
-; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
-; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
-; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_vmcnt_global_loads:
-; GFX12-EXPAND: ; %bb.0:
-; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_clause 0x2
-; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
-; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
-; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2
-; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1
-; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0
-; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads:
-; GFX12-NOEXPAND: ; %bb.0:
-; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_clause 0x2
-; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256
-; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512
-; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0
-; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX12-NOEXPAND-NEXT: s_endpgm
-
- ; Use thread ID to create thread-varying addresses -> forces vector loads
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid64 = zext i32 %tid to i64
-
- ; Three separate global loads with thread-varying addresses
- ; Non-volatile loads allow multiple operations to be in-flight
- %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
- %val0 = load i32, ptr addrspace(1) %ptr0, align 4
-
- %offset1 = add i64 %tid64, 64
- %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
- %val1 = load i32, ptr addrspace(1) %ptr1, align 4
-
- %offset2 = add i64 %tid64, 128
- %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
- %val2 = load i32, ptr addrspace(1) %ptr2, align 4
-
- %sum1 = add i32 %val0, %val1
- %sum2 = add i32 %sum1, %val2
-
- %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64
- store i32 %sum2, ptr addrspace(1) %out_ptr, align 4
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-
-define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 {
-; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX9-EXPAND: ; %bb.0:
-; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
-; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2
-; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1]
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX9-NOEXPAND: ; %bb.0:
-; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2
-; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1]
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX10-EXPAND: ; %bb.0:
-; GFX10-EXPAND-NEXT: s_clause 0x1
-; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
-; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX10-NOEXPAND: ; %bb.0:
-; GFX10-NOEXPAND-NEXT: s_clause 0x1
-; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX11-EXPAND: ; %bb.0:
-; GFX11-EXPAND-NEXT: s_clause 0x1
-; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
-; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX11-NOEXPAND: ; %bb.0:
-; GFX11-NOEXPAND-NEXT: s_clause 0x1
-; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
-; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX12-EXPAND: ; %bb.0:
-; GFX12-EXPAND-NEXT: s_clause 0x1
-; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
-; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
-; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
-; GFX12-NOEXPAND: ; %bb.0:
-; GFX12-NOEXPAND-NEXT: s_clause 0x1
-; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
-; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
-; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-NOEXPAND-NEXT: s_endpgm
-
- %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0
- %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
- %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2
- %val0 = load i32, ptr addrspace(3) %ptr0, align 4
- %val1 = load i32, ptr addrspace(3) %ptr1, align 4
- %val2 = load i32, ptr addrspace(3) %ptr2, align 4
- %sum1 = add i32 %val0, %val1
- %sum2 = add i32 %sum1, %val2
- store i32 %sum2, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 {
-; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX9-EXPAND: ; %bb.0:
-; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX9-NOEXPAND: ; %bb.0:
-; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX10-EXPAND: ; %bb.0:
-; GFX10-EXPAND-NEXT: s_clause 0x1
-; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX10-NOEXPAND: ; %bb.0:
-; GFX10-NOEXPAND-NEXT: s_clause 0x1
-; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7]
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX11-EXPAND: ; %bb.0:
-; GFX11-EXPAND-NEXT: s_clause 0x1
-; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX11-NOEXPAND: ; %bb.0:
-; GFX11-NOEXPAND-NEXT: s_clause 0x1
-; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX12-EXPAND: ; %bb.0:
-; GFX12-EXPAND-NEXT: s_clause 0x1
-; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
-; GFX12-NOEXPAND: ; %bb.0:
-; GFX12-NOEXPAND-NEXT: s_clause 0x1
-; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5]
-; GFX12-NOEXPAND-NEXT: s_endpgm
-
- %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4
- %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4
-
- %result = add i32 %scalar_val1, %scalar_val2
- store i32 %result, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Test that expansion is NOT applied when counters are out-of-order (mixed event types).
-; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete
-; out-of-order relative to each other. When both are in-flight, we should NOT expand
-; because the expansion would be misleading.
-define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 {
-; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX9-EXPAND: ; %bb.0:
-; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
-; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3]
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX9-NOEXPAND: ; %bb.0:
-; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3]
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX10-EXPAND: ; %bb.0:
-; GFX10-EXPAND-NEXT: s_clause 0x1
-; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
-; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX10-NOEXPAND: ; %bb.0:
-; GFX10-NOEXPAND-NEXT: s_clause 0x1
-; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX11-EXPAND: ; %bb.0:
-; GFX11-EXPAND-NEXT: s_clause 0x1
-; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
-; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX11-NOEXPAND: ; %bb.0:
-; GFX11-NOEXPAND-NEXT: s_clause 0x1
-; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX12-EXPAND: ; %bb.0:
-; GFX12-EXPAND-NEXT: s_clause 0x1
-; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
-; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
-; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
-; GFX12-NOEXPAND: ; %bb.0:
-; GFX12-NOEXPAND-NEXT: s_clause 0x1
-; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
-; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX12-NOEXPAND-NEXT: s_endpgm
-
- %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4
- %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4
- %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
- %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4
- %sum1 = add i32 %lds_val1, %lds_val2
- %sum2 = add i32 %sum1, %smem_val
- store i32 %sum2, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 {
-; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+)
-; GFX9-EXPAND-LABEL: test_vscnt_global_stores:
-; GFX9-EXPAND: ; %bb.0: ; %entry
-; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3
-; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
-; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores:
-; GFX9-NOEXPAND: ; %bb.0: ; %entry
-; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3
-; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512
-; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_vscnt_global_stores:
-; GFX10-EXPAND: ; %bb.0: ; %entry
-; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3
-; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
-; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
-; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores:
-; GFX10-NOEXPAND: ; %bb.0: ; %entry
-; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3
-; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256
-; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512
-; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_vscnt_global_stores:
-; GFX11-EXPAND: ; %bb.0: ; %entry
-; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
-; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: s_clause 0x2
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
-; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
-; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores:
-; GFX11-NOEXPAND: ; %bb.0: ; %entry
-; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
-; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: s_clause 0x2
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
-; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
-; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_vscnt_global_stores:
-; GFX12-EXPAND: ; %bb.0: ; %entry
-; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
-; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-EXPAND-NEXT: s_clause 0x2
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
-; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
-; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores:
-; GFX12-NOEXPAND: ; %bb.0: ; %entry
-; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
-; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
-; GFX12-NOEXPAND-NEXT: s_clause 0x2
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256
-; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512
-; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0
-; GFX12-NOEXPAND-NEXT: s_endpgm
-entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid64 = zext i32 %tid to i64
-
- ; Issue multiple stores
- %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
- store i32 1, ptr addrspace(1) %ptr0, align 4
-
- %offset1 = add i64 %tid64, 64
- %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
- store i32 2, ptr addrspace(1) %ptr1, align 4
-
- %offset2 = add i64 %tid64, 128
- %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
- store i32 3, ptr addrspace(1) %ptr2, align 4
-
- ; Memory fence forces wait for all stores
- fence release
- ret void
-}
-
-define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 {
-; Test export operations (EXP_CNT/expcnt)
-; GFX9-EXPAND-LABEL: test_expcnt_exports:
-; GFX9-EXPAND: ; %bb.0: ; %entry
-; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
-; GFX9-EXPAND-NEXT: s_endpgm
-;
-; GFX9-NOEXPAND-LABEL: test_expcnt_exports:
-; GFX9-NOEXPAND: ; %bb.0: ; %entry
-; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
-; GFX9-NOEXPAND-NEXT: s_endpgm
-;
-; GFX10-EXPAND-LABEL: test_expcnt_exports:
-; GFX10-EXPAND: ; %bb.0: ; %entry
-; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done
-; GFX10-EXPAND-NEXT: s_endpgm
-;
-; GFX10-NOEXPAND-LABEL: test_expcnt_exports:
-; GFX10-NOEXPAND: ; %bb.0: ; %entry
-; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done
-; GFX10-NOEXPAND-NEXT: s_endpgm
-;
-; GFX11-EXPAND-LABEL: test_expcnt_exports:
-; GFX11-EXPAND: ; %bb.0: ; %entry
-; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
-; GFX11-EXPAND-NEXT: s_endpgm
-;
-; GFX11-NOEXPAND-LABEL: test_expcnt_exports:
-; GFX11-NOEXPAND: ; %bb.0: ; %entry
-; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3
-; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0
-; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2
-; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done
-; GFX11-NOEXPAND-NEXT: s_endpgm
-;
-; GFX12-EXPAND-LABEL: test_expcnt_exports:
-; GFX12-EXPAND: ; %bb.0: ; %entry
-; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3
-; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0
-; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2
-; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
-; GFX12-EXPAND-NEXT: s_endpgm
-;
-; GFX12-NOEXPAND-LABEL: test_expcnt_exports:
-; GFX12-NOEXPAND: ; %bb.0: ; %entry
-; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3
-; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0
-; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2
-; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done
-; GFX12-NOEXPAND-NEXT: s_endpgm
-entry:
- ; Multiple MRT exports
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
- call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false)
- call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false)
- ; Final export with done bit
- call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false)
- ret void
-}
-
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
-
-attributes #0 = { nounwind ATTRS }
More information about the llvm-branch-commits
mailing list