[llvm] [AMDGPU] Do not optimize away pre-existing waitcnt instructions at -O0 (PR #90716)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 02:44:51 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
The autogenerated memory legalizer tests use -O0 so this allows us to
see the exact waitcnts that were inserted by the memory legalizer
without them being optimized away.
---
Patch is 17.50 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90716.diff
28 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+27-24)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll (+493-13)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+12950-4603)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (+46-17)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+727-313)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+11394-4514)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+12950-4603)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+576-233)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+11234-4462)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+11869-4502)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+13401-6341)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll (+38-25)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+655-369)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+12012-6640)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+12709-6085)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+570-317)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+12012-6640)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+12919-6519)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll (+8124-3910)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+548-388)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll (+7578-3904)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll (+8124-3910)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+424-282)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll (+7578-3904)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+8124-3910)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+640-540)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+387-307)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 15a1db51c6d78b..0ccd9c20a328ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -448,12 +448,19 @@ class WaitcntGenerator {
const SIInstrInfo *TII = nullptr;
AMDGPU::IsaVersion IV;
InstCounterType MaxCounter;
+ bool OptNone;
public:
WaitcntGenerator() {}
- WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
- : ST(ST), TII(ST->getInstrInfo()),
- IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+ WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
+ : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
+ OptNone(MF.getFunction().hasOptNone() ||
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
+
+ // Return true if the current function should be compiled with no
+ // optimization.
+ bool isOptNone() const { return OptNone; }
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
@@ -504,8 +511,8 @@ class WaitcntGenerator {
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
WaitcntGeneratorPreGFX12() {}
- WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
- : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+ WaitcntGeneratorPreGFX12(const MachineFunction &MF)
+ : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -539,8 +546,9 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
WaitcntGeneratorGFX12Plus() {}
- WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
- : WaitcntGenerator(ST, MaxCounter) {}
+ WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
+ InstCounterType MaxCounter)
+ : WaitcntGenerator(MF, MaxCounter) {}
bool
applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -597,8 +605,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
bool ForceEmitZeroWaitcnts;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
- bool OptNone;
-
// In any given run of this pass, WCG will point to one of these two
// generator objects, which must have been re-initialised before use
// from a value made using a subtarget constructor.
@@ -1203,19 +1209,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
continue;
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
- bool IsSoft = Opcode != II.getOpcode();
+ bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
// Update required wait count. If this is a soft waitcnt (= it was added
// by an earlier pass), it may be entirely removed.
if (Opcode == AMDGPU::S_WAITCNT) {
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- if (IsSoft)
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
// Merge consecutive waitcnt of the same type by erasing multiples.
- if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
+ if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
II.eraseFromParent();
Modified = true;
} else
@@ -1226,11 +1232,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
unsigned OldVSCnt =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- if (IsSoft)
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
- if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
+ if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
II.eraseFromParent();
Modified = true;
} else
@@ -1356,13 +1362,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// by an earlier pass), it may be entirely removed.
unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
- bool IsSoft = Opcode != II.getOpcode();
+ bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
unsigned OldEnc =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
- if (IsSoft)
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedLoadDsCntInstr;
@@ -1370,7 +1376,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
unsigned OldEnc =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
- if (IsSoft)
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
@@ -1379,7 +1385,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
assert(CT.has_value());
unsigned OldCnt =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- if (IsSoft)
+ if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
addWait(Wait, CT.value(), OldCnt);
UpdatableInstr = &WaitInstrs[CT.value()];
@@ -1649,7 +1655,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// * we are not in Dynamic VGPR mode
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
+ if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
ReleaseVGPRInsts.insert(&MI);
@@ -2471,11 +2477,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (ST->hasExtendedWaitCounts()) {
MaxCounter = NUM_EXTENDED_INST_CNTS;
- WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
+ WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
WCG = &WCGGFX12Plus;
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
- WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
+ WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
WCG = &WCGPreGFX12;
}
@@ -2487,9 +2493,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- OptNone = MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
-
HardwareLimits Limits = {};
if (ST->hasExtendedWaitCounts()) {
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index e13542f61474e2..4128bfe392dc75 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@@ -928,59 +928,77 @@ entry:
define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX6-LABEL: workgroup_acquire_fence:
; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: workgroup_acquire_fence:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: workgroup_acquire_fence:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: workgroup_acquire_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_acquire_fence:
; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acquire_fence:
; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
@@ -990,54 +1008,72 @@ entry:
define amdgpu_kernel void @workgroup_release_fence() {
; GFX6-LABEL: workgroup_release_fence:
; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: workgroup_release_fence:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: workgroup_release_fence:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: workgroup_release_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: workgroup_release_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: workgroup_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_release_fence:
; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
@@ -1047,59 +1083,77 @@ entry:
define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX6-LABEL: workgroup_acq_rel_fence:
; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: workgroup_acq_rel_fence:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: workgroup_acq_rel_fence:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: workgroup_acq_rel_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
@@ -1109,59 +1163,77 @@ entry:
define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX6-LABEL: workgroup_seq_cst_fence:
; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: workgroup_seq_cst_fence:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: workgroup_seq_cst_fence:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: workgroup_seq_cst_fence:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
; GFX11-WGP: ; %bb.0: ; %entry
+; GFX1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/90716
More information about the llvm-commits
mailing list