[llvm] [AMDGPU] Add scheduling DAG mutation for hazard latencies (PR #170075)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 00:15:38 PST 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/170075
From 97cc017ab8d87178a02370528d96a32adc865c9b Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Fri, 28 Nov 2025 12:11:55 +0900
Subject: [PATCH 1/2] [AMDGPU] Add scheduling DAG mutation for hazard latencies
Improve waitcnt merging in ML kernel loops by increasing the latency
of VALU writes to SGPRs.
Specifically, this helps the case of a V_CMP result feeding
V_CNDMASK instructions.
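For illustration, here is a hand-written sketch of the pattern being
targeted (not taken from this patch; the registers and values are
hypothetical). On wave64 GFX11, a VALU write to an SGPR pair, such as
a V_CMP result, may require a hazard wait before a dependent VALU
read. Boosting the latency of that edge encourages the scheduler to
place independent work between the write and the read:

  ; Without the mutation: the read is scheduled immediately after the
  ; write, so a hazard wait (s_waitcnt_depctr) stalls the VALU pipe.
  v_cmp_gt_u32_e64 s[0:1], s8, v5     ; VALU write to an SGPR pair
  v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; dependent VALU read
  v_add_u32_e32 v7, 64, v5            ; independent work, too late

  ; With the boosted latency: the independent work is hoisted between
  ; the write and the read, hiding or merging the required wait.
  v_cmp_gt_u32_e64 s[0:1], s8, v5
  v_add_u32_e32 v7, 64, v5
  v_cndmask_b32_e64 v6, 0, -1, s[0:1]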
---
.../lib/Target/AMDGPU/AMDGPUHazardLatency.cpp | 78 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h | 24 +++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 2 +-
.../atomic_optimizations_global_pointer.ll | 77 ++++----
.../atomic_optimizations_local_pointer.ll | 96 +++++-----
.../AMDGPU/gfx11-sgpr-hazard-latency.mir | 169 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 20 +--
9 files changed, 369 insertions(+), 102 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
create mode 100644 llvm/test/CodeGen/AMDGPU/gfx11-sgpr-hazard-latency.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
new file mode 100644
index 0000000000000..2257154d68543
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
@@ -0,0 +1,78 @@
+//===--- AMDGPUHazardLatency.cpp - AMDGPU Hazard Latency Adjustment -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation that adjusts
+/// the latency of data edges between instructions using registers
+/// potentially subject to additional hazard waits that are not
+/// accounted for in the normal scheduling model.
+/// While the scheduling model typically remains accurate in these
+/// scenarios, adjusting the latency of the relevant edges can
+/// improve wait merging and reduce the pipeline impact of waits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHazardLatency.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class HazardLatency : public ScheduleDAGMutation {
+private:
+ const GCNSubtarget *ST;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+
+public:
+ HazardLatency(MachineFunction *MF) {
+ ST = &MF->getSubtarget<GCNSubtarget>();
+ TRI = ST->getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ }
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void HazardLatency::apply(ScheduleDAGInstrs *DAG) {
+ constexpr unsigned MaskLatencyBoost = 3;
+
+ if (!ST->hasVALUMaskWriteHazard() || !ST->isWave64())
+ return;
+
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (!SIInstrInfo::isVALU(*MI))
+ continue;
+ if (MI->getOpcode() == AMDGPU::V_READLANE_B32 ||
+ MI->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
+ continue;
+ for (SDep &SuccDep : SU.Succs) {
+ if (SuccDep.isCtrl())
+ continue;
+ // Boost latency on VALU writes to SGPRs used by VALUs.
+ // Reduce risk of premature VALU pipeline stall on associated reads.
+ MachineInstr *DestMI = SuccDep.getSUnit()->getInstr();
+ if (!SIInstrInfo::isVALU(*DestMI))
+ continue;
+ Register Reg = SuccDep.getReg();
+ if (!TRI->isSGPRReg(*MRI, Reg))
+ continue;
+ SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost);
+ }
+ }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF) {
+ return std::make_unique<HazardLatency>(MF);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
new file mode 100644
index 0000000000000..134cc27743cd1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
@@ -0,0 +1,24 @@
+//===- AMDGPUHazardLatency.h - Hazard Latency Adjustment --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e5a35abe6da6b..5c3798c3f2309 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHazardLatency.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPULowerVGPREncoding.h"
@@ -648,6 +649,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
@@ -669,6 +671,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
@@ -1210,6 +1213,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+ DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
return DAG;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 4baae51e021c5..583ec8d7898e8 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
AMDGPUGlobalISelUtils.cpp
+ AMDGPUHazardLatency.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 6846137272ec6..aa25294ba17b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -820,8 +820,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: s_and_b32 s8, 1, s8
; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6
; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
+; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6
; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5
; GFX11_W64-NEXT: v_mov_b32_e32 v3, s7
; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 12cb8d2f6fb51..66d934b0170f4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2596,11 +2596,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164_ITERATIVE-NEXT: s_endpgm
;
@@ -3143,15 +3143,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -5853,9 +5853,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc
; GFX1164-NEXT: v_mov_b32_e32 v0, v5
; GFX1164-NEXT: v_mov_b32_e32 v2, v7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v6
; GFX1164-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1164-NEXT: v_mov_b32_e32 v1, v6
; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
@@ -5876,11 +5876,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v4
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v4
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
+; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
@@ -6381,9 +6381,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, s15, v8, vcc
; GFX1164-NEXT: v_mov_b32_e32 v0, v5
; GFX1164-NEXT: v_mov_b32_e32 v2, v7
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v6
; GFX1164-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1164-NEXT: v_mov_b32_e32 v1, v6
; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
@@ -6981,9 +6981,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7
; GFX1164_ITERATIVE-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
@@ -7003,10 +7003,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v4
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164_ITERATIVE-NEXT: s_endpgm
;
@@ -7665,15 +7664,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -12767,10 +12766,10 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
+; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX1164-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
@@ -12825,10 +12824,10 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s10, v0
+; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX1164-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX1164-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13812,22 +13811,23 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2
+; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX1164-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX1164-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -13872,22 +13872,23 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2
+; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_sdst(0)
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX1164-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1164-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1164-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_sdst(0)
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 785aee07a990e..b71577385606a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2222,11 +2222,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164_ITERATIVE-NEXT: s_endpgm
@@ -2629,57 +2629,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
@@ -3285,15 +3284,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -5028,11 +5027,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
+; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -5636,11 +5635,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164_ITERATIVE-NEXT: s_endpgm
@@ -6043,57 +6042,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
@@ -11059,11 +11057,11 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -11423,10 +11421,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
; GFX1164_ITERATIVE-NEXT: .LBB23_4:
; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -12884,11 +12881,11 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB25_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -13248,10 +13245,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
; GFX1164_ITERATIVE-NEXT: .LBB26_4:
; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -14705,11 +14701,11 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB28_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -15063,10 +15059,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
; GFX1164_ITERATIVE-NEXT: .LBB29_4:
; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -16516,11 +16511,11 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB31_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
@@ -16874,10 +16869,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
; GFX1164_ITERATIVE-NEXT: .LBB32_4:
; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-sgpr-hazard-latency.mir b/llvm/test/CodeGen/AMDGPU/gfx11-sgpr-hazard-latency.mir
new file mode 100644
index 0000000000000..1d071e7395e4b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-sgpr-hazard-latency.mir
@@ -0,0 +1,169 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before machine-scheduler -stop-after amdgpu-wait-sgpr-hazards -o - %s | FileCheck -check-prefix=GFX11 %s
+
+# The following loop should only require a single s_waitcnt_depctr
+---
+name: gemm_loop1
+tracksRegLiveness: true
+body: |
+ ; GFX11-LABEL: name: gemm_loop1
+ ; GFX11: bb.0:
+ ; GFX11-NEXT: successors: %bb.1(0x80000000)
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: S_WAITCNT 0
+ ; GFX11-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: renamable $sgpr8 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr9 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr10 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr11 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr12 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr13 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr14 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr15 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr16 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX11-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ ; GFX11-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: bb.1:
+ ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX11-NEXT: liveins: $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: renamable $vgpr5 = V_ADD_U32_e32 $sgpr17, $vgpr0, implicit $exec
+ ; GFX11-NEXT: renamable $sgpr17 = S_ADDK_I32 killed renamable $sgpr17, 128, implicit-def dead $scc
+ ; GFX11-NEXT: S_CMP_LT_U32 renamable $sgpr17, renamable $sgpr15, implicit-def $scc
+ ; GFX11-NEXT: renamable $sgpr17 = S_CSELECT_B32 killed renamable $sgpr17, 0, implicit killed $scc
+ ; GFX11-NEXT: renamable $vgpr7 = V_ADD_U32_e32 64, $vgpr5, implicit $exec
+ ; GFX11-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_U32_e64 $sgpr15, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61855
+ ; GFX11-NEXT: renamable $vgpr6 = V_ADD_U32_e32 $sgpr8, $vgpr5, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr8 = V_ADD_U32_e32 $sgpr9, killed $vgpr5, implicit $exec
+ ; GFX11-NEXT: renamable $sgpr16 = nsw S_ADD_I32 killed renamable $sgpr16, -1, implicit-def dead $scc
+ ; GFX11-NEXT: renamable $sgpr20_sgpr21 = V_CMP_GT_U32_e64 $sgpr15, $vgpr7, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61855
+ ; GFX11-NEXT: S_CMP_LG_U32 renamable $sgpr16, 0, implicit-def $scc
+ ; GFX11-NEXT: renamable $vgpr5 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr6, $sgpr18_sgpr19, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr6 = V_ADD_U32_e32 $sgpr8, $vgpr7, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr7 = V_ADD_U32_e32 $sgpr9, killed $vgpr7, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr8 = V_CNDMASK_B32_e64 0, $sgpr10, 0, killed $vgpr8, killed $sgpr18_sgpr19, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr6 = V_CNDMASK_B32_e64 0, -1, 0, killed $vgpr6, $sgpr20_sgpr21, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr7 = V_CNDMASK_B32_e64 0, $sgpr10, 0, killed $vgpr7, killed $sgpr20_sgpr21, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr9 = V_ADD_U32_e32 $sgpr11, $vgpr8, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr10 = V_ADD_U32_e32 $sgpr12, $vgpr8, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr11 = V_ADD_U32_e32 $sgpr13, $vgpr8, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr8 = V_ADD_U32_e32 $sgpr14, killed $vgpr8, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr12 = V_ADD_U32_e32 $sgpr11, $vgpr7, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr13 = V_ADD_U32_e32 $sgpr12, $vgpr7, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr14 = V_ADD_U32_e32 $sgpr13, $vgpr7, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr7 = V_ADD_U32_e32 $sgpr14, killed $vgpr7, implicit $exec
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr6, implicit killed $vgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec, implicit killed $vgpr6 :: (dereferenceable invariant load (s16), align 1, addrspace 8) {
+ ; GFX11-NEXT: S_CLAUSE 1
+ ; GFX11-NEXT: renamable $vgpr5 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr6 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: }
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr8, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr7, implicit killed $vgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec, implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $vgpr8, implicit killed $vgpr12, implicit killed $vgpr13, implicit killed $vgpr14, implicit killed $vgpr7 :: (dereferenceable invariant load (s16), align 1, addrspace 8) {
+ ; GFX11-NEXT: S_CLAUSE 7
+ ; GFX11-NEXT: renamable $vgpr9 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr9, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr10 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr10, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr11 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr11, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr8 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr8, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr12 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr12, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr13 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr13, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr14 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr14, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr7 = BUFFER_LOAD_USHORT_IDXEN killed renamable $vgpr7, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ ; GFX11-NEXT: }
+ ; GFX11-NEXT: S_WAITCNT 9207
+ ; GFX11-NEXT: renamable $vgpr5 = V_PERM_B32_e64 killed $vgpr6, killed $vgpr5, 84148480, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT 4087
+ ; GFX11-NEXT: renamable $vgpr6 = V_PERM_B32_e64 killed $vgpr12, killed $vgpr9, 84148480, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT 3063
+ ; GFX11-NEXT: renamable $vgpr9 = V_PERM_B32_e64 killed $vgpr13, killed $vgpr10, 84148480, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT 2039
+ ; GFX11-NEXT: renamable $vgpr10 = V_PERM_B32_e64 killed $vgpr14, killed $vgpr11, 84148480, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: renamable $vgpr7 = V_PERM_B32_e64 killed $vgpr7, killed $vgpr8, 84148480, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr1 = nofpexcept V_DOT2_F32_F16 8, $vgpr5, 8, killed $vgpr6, 8, killed $vgpr1, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr2 = nofpexcept V_DOT2_F32_F16 8, $vgpr5, 8, killed $vgpr9, 8, killed $vgpr2, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr3 = nofpexcept V_DOT2_F32_F16 8, $vgpr5, 8, killed $vgpr10, 8, killed $vgpr3, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr4 = nofpexcept V_DOT2_F32_F16 8, killed $vgpr5, 8, killed $vgpr7, 8, killed $vgpr4, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: bb.2:
+ ; GFX11-NEXT: S_ENDPGM 0
+ bb.0:
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:sreg_32 = S_MOV_B32 0
+ %2:sreg_32 = S_MOV_B32 0
+ %3:sreg_32 = S_MOV_B32 0
+ %4:sgpr_128 = IMPLICIT_DEF
+ %5:sreg_32 = S_MOV_B32 0
+ %6:sgpr_128 = IMPLICIT_DEF
+ %7:sreg_32 = S_MOV_B32 0
+ %8:sreg_32 = S_MOV_B32 0
+ %9:sreg_32 = S_MOV_B32 0
+ %10:sgpr_32 = S_MOV_B32 0
+ %11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %13:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %14:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %15:sreg_32 = S_MOV_B32 0
+ %16:sreg_32 = S_MOV_B32 0
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %17:vgpr_32 = V_ADD_U32_e64 %16, %0, 0, implicit $exec
+ %18:vgpr_32 = V_ADD_U32_e64 %1, %17, 0, implicit $exec
+ %19:sreg_64_xexec = V_CMP_GT_U32_e64 %10, %17, implicit $exec
+ %20:vgpr_32 = V_CNDMASK_B32_e64 0, -1, 0, %18, %19, implicit $exec
+ %21:vgpr_32 = V_ADD_U32_e64 %2, %17, 0, implicit $exec
+ %22:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %21, %19, implicit $exec
+ %23:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %20, %4, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %24:vgpr_32 = V_ADD_U32_e64 %5, %22, 0, implicit $exec
+ %25:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %24, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %26:vgpr_32 = V_ADD_U32_e64 %7, %22, 0, implicit $exec
+ %27:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %26, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %28:vgpr_32 = V_ADD_U32_e64 %8, %22, 0, implicit $exec
+ %29:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %28, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %30:vgpr_32 = V_ADD_U32_e64 %9, %22, 0, implicit $exec
+ %31:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %30, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %32:vgpr_32 = V_ADD_U32_e64 64, %17, 0, implicit $exec
+ %33:vgpr_32 = V_ADD_U32_e64 %1, %32, 0, implicit $exec
+ %34:sreg_64_xexec = V_CMP_GT_U32_e64 %10, %32, implicit $exec
+ %35:vgpr_32 = V_CNDMASK_B32_e64 0, -1, 0, %33, %34, implicit $exec
+ %36:vgpr_32 = V_ADD_U32_e64 %2, %32, 0, implicit $exec
+ %37:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %36, %34, implicit $exec
+ %38:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %35, %4, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %39:vgpr_32 = V_ADD_U32_e64 %5, %37, 0, implicit $exec
+ %40:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %39, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %41:vgpr_32 = V_ADD_U32_e64 %7, %37, 0, implicit $exec
+ %42:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %41, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %43:vgpr_32 = V_ADD_U32_e64 %8, %37, 0, implicit $exec
+ %44:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %43, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %45:vgpr_32 = V_ADD_U32_e64 %9, %37, 0, implicit $exec
+ %46:vgpr_32 = BUFFER_LOAD_USHORT_IDXEN %45, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s16), align 1, addrspace 8)
+ %47:vgpr_32 = V_PERM_B32_e64 %38, %23, 84148480, implicit $exec
+ %48:vgpr_32 = V_PERM_B32_e64 %40, %25, 84148480, implicit $exec
+ %11:vgpr_32 = nofpexcept V_DOT2_F32_F16 8, %47, 8, %48, 8, %11, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %49:vgpr_32 = V_PERM_B32_e64 %42, %27, 84148480, implicit $exec
+ %12:vgpr_32 = nofpexcept V_DOT2_F32_F16 8, %47, 8, %49, 8, %12, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %50:vgpr_32 = V_PERM_B32_e64 %44, %29, 84148480, implicit $exec
+ %13:vgpr_32 = nofpexcept V_DOT2_F32_F16 8, %47, 8, %50, 8, %13, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %51:vgpr_32 = V_PERM_B32_e64 %46, %31, 84148480, implicit $exec
+ %14:vgpr_32 = nofpexcept V_DOT2_F32_F16 8, %47, 8, %51, 8, %14, -1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %52:sreg_32 = S_ADD_I32 %16, 128, implicit-def dead $scc
+ S_CMP_LT_U32 %52, %10, implicit-def $scc
+ %16:sreg_32 = S_CSELECT_B32 %52, 0, implicit killed $scc
+ %15:sreg_32 = nsw S_ADD_I32 %15, -1, implicit-def dead $scc
+ S_CMP_LG_U32 %15, 0, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 890ebddf36801..2e4adad77dcc8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -1125,10 +1125,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1212,10 +1211,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1393,10 +1391,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrs
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x7ff000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1441,10 +1438,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrs
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
From 873b21ce79e56433da2bcb7caf00cdcb0eb43817 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 8 Dec 2025 17:15:04 +0900
Subject: [PATCH 2/2] - Use references
---
llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
index 2257154d68543..637f56bc8c5b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
@@ -28,23 +28,21 @@ namespace {
class HazardLatency : public ScheduleDAGMutation {
private:
- const GCNSubtarget *ST;
- const SIRegisterInfo *TRI;
- const MachineRegisterInfo *MRI;
+ const GCNSubtarget &ST;
+ const SIRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
public:
- HazardLatency(MachineFunction *MF) {
- ST = &MF->getSubtarget<GCNSubtarget>();
- TRI = ST->getRegisterInfo();
- MRI = &MF->getRegInfo();
- }
+ HazardLatency(MachineFunction *MF)
+ : ST(MF->getSubtarget<GCNSubtarget>()), TRI(*ST.getRegisterInfo()),
+ MRI(MF->getRegInfo()) {}
void apply(ScheduleDAGInstrs *DAG) override;
};
void HazardLatency::apply(ScheduleDAGInstrs *DAG) {
constexpr unsigned MaskLatencyBoost = 3;
- if (!ST->hasVALUMaskWriteHazard() || !ST->isWave64())
+ if (!ST.hasVALUMaskWriteHazard() || !ST.isWave64())
return;
for (SUnit &SU : DAG->SUnits) {
@@ -63,7 +61,7 @@ void HazardLatency::apply(ScheduleDAGInstrs *DAG) {
if (!SIInstrInfo::isVALU(*DestMI))
continue;
Register Reg = SuccDep.getReg();
- if (!TRI->isSGPRReg(*MRI, Reg))
+ if (!TRI.isSGPRReg(MRI, Reg))
continue;
SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost);
}