[llvm] [AMDGPU] w/a hazard with writing s102/103 and reading FLAT_SCRATCH_BASE (PR #153878)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 15 14:21:08 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
---
Patch is 21.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153878.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+78)
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h (+1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+6)
- (modified) llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir (+478)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5e297c7540c48..dd7c1914d3440 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1204,6 +1204,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixGetRegWaitIdle(MI);
if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
fixDsAtomicAsyncBarrierArriveB64(MI);
+ if (ST.hasScratchBaseForwardingHazard())
+ fixScratchBaseForwardingHazard(MI);
}
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -3468,3 +3470,79 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
return true;
}
+
+bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
+ // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
+ // for hazard to trigger.
+ if (!IsHazardRecognizerMode)
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
+ const int FlatScrBaseWaitStates = 10;
+
+ bool ReadsFlatScrLo =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
+ bool ReadsFlatScrHi =
+ MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
+ if (isSGetReg(MI->getOpcode())) {
+ switch (getHWReg(TII, *MI)) {
+ default:
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
+ ReadsFlatScrLo = true;
+ break;
+ case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
+ ReadsFlatScrHi = true;
+ break;
+ }
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ auto IsRegDefHazard = [&](Register Reg) -> bool {
+ DenseSet<const MachineBasicBlock *> Visited;
+ auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
+ return MI.modifiesRegister(Reg, TRI);
+ };
+
+ // This literally abuses the idea of waitstates. Instead of waitstates it
+ // returns 1 for SGPR written and 0 otherwise.
+ auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
+ if (!TII->isSALU(MI) && !TII->isVALU(MI))
+ return 0;
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (TRI->isSGPRReg(MRI, MO.getReg()))
+ return 1;
+ }
+ return 0;
+ };
+
+ auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+ unsigned Wait = MI.getOperand(0).getImm();
+ if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
+ AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
+ return true;
+ }
+ return SgprWrites >= FlatScrBaseWaitStates;
+ };
+
+ return ::getWaitStatesSince(
+ IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
+ 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
+ };
+
+ if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
+ !IsRegDefHazard(AMDGPU::SGPR102)) &&
+ (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
+ !IsRegDefHazard(AMDGPU::SGPR103)))
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
+ AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 890d5cbd154d6..e0982b46424b9 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -112,6 +112,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixRequiredExportPriority(MachineInstr *MI);
bool fixGetRegWaitIdle(MachineInstr *MI);
bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI);
+ bool fixScratchBaseForwardingHazard(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 436f5c0801fad..404a476a3076a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1821,6 +1821,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
return getGeneration() == GFX12;
}
+
+ // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
+ // read.
+ bool hasScratchBaseForwardingHazard() const {
+ return GFX1250Insts && getGeneration() == GFX12;
+ }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir
index f1dbabf1e1a83..f4596b0832d97 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir
@@ -15,3 +15,481 @@ body: |
; GCN-NEXT: S_WAITCNT_DEPCTR 65507
DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
...
+
+---
+name: write_s102_read_flat_scr_base_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base
+ ; GCN: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s103_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base
+ ; GCN: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s102_s103_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_s103_read_flat_scr_base
+ ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr102_sgpr103 = S_MOV_B64 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s102_getreg_flat_scr_base_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_getreg_flat_scr_base_lo
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 20, implicit $mode
+ $sgpr102 = S_MOV_B32 0
+ $sgpr1 = S_GETREG_B32 20, implicit $mode
+...
+
+---
+name: write_s103_getreg_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_getreg_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode
+ $sgpr103 = S_MOV_B32 0
+ $sgpr1 = S_GETREG_B32 21, implicit $mode
+...
+
+---
+name: write_s102_s103_getreg_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_s103_getreg_flat_scr_base_hi
+ ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode
+ $sgpr102_sgpr103 = S_MOV_B64 0
+ $sgpr1 = S_GETREG_B32 21, implicit $mode
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_9_salu_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_9_salu_valu
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $sgpr10 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi_9_salu_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_9_salu_valu
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $sgpr10 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_hi_no_hazard
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_hi_no_hazard
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_expired_by_wait0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait0
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 0
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 61950
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 61951
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 65534
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr103 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_cross_blocks
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_cross_blocks
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/153878
More information about the llvm-commits mailing list