[llvm] [AMDGPU] Detect VALU-after-MFMA hazard in post-RA scheduler's getHazardType (PR #184084)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 2 02:27:58 PST 2026


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/184084

>From eec5eaf4fc49c33b656dc701be7f52e62284af07 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Sun, 1 Mar 2026 21:37:14 -0600
Subject: [PATCH 1/2] [AMDGPU] Detect VALU-after-MFMA hazard in post-RA
 scheduler's getHazardType

Add a check for the VALU-immediately-after-MFMA hazard in getHazardType so the
post-RA scheduler can avoid scheduling a non-MFMA VALU instruction immediately
after an MFMA with no intervening stall cycle.

Previously this hazard was only handled in PreEmitNoopsCommon (used by the
PostRAHazardRecognizer pass). The scheduler path did not consider it, so the
scheduler could place a VALU right after an MFMA and rely on later noop
insertion. By checking in getHazardType, the scheduler can prefer other
instructions or add stall cycles instead.

Introduce checkVALUImmediatelyAfterMFMAHazard to centralize the logic. It walks
EmittedInstrs starting from the most recently emitted entry and returns 1 if the
most recent instruction is an MFMA with no stall cycle in between, otherwise 0.
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 28 +++++++
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |  1 +
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 84 +++++++++----------
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll |  2 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 24 +++---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 20 ++---
 ...vm.amdgcn.sched.group.barrier.iterative.ll | 26 +++---
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 36 ++++----
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       | 11 +--
 9 files changed, 131 insertions(+), 101 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 035e874834166..993bd65a08daa 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -234,6 +234,10 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       checkMAIVALUHazards(MI) > 0)
     return HazardType;
 
+  if (SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMFMA(*MI) &&
+      checkVALUImmediatelyAfterMFMAHazard(MI) > 0)
+    return HazardType;
+
   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
     return HazardType;
 
@@ -2467,6 +2471,30 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
 }
 
+int GCNHazardRecognizer::checkVALUImmediatelyAfterMFMAHazard(MachineInstr *MI) {
+  if (!ST.hasMAIInsts())
+    return 0;
+
+  assert(SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMFMA(*MI));
+
+  // Find the most recently emitted instruction. Count leading nullptrs (stall
+  // cycles); one or more means we've had at least one cycle of separation.
+  int LeadingNullptrs = 0;
+  for (MachineInstr *EmittedMI : EmittedInstrs) {
+    if (!EmittedMI) {
+      ++LeadingNullptrs;
+      continue;
+    }
+    // Found the most recent instruction; only this first non-null entry matters.
+    if (LeadingNullptrs >= 1)
+      return 0; // Stall cycle(s) since the last instruction, no hazard
+    if (SIInstrInfo::isMFMA(*EmittedMI))
+      return 1; // MFMA is most recent with no intervening cycle, block VALU
+    return 0;   // Most recent is not MFMA
+  }
+  return 0; // No instruction recorded (list empty or only stall cycles)
+}
+
 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
   // Early exit if no padding is requested.
   if (MFMAPaddingRatio == 0)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index b331504d40113..1d1ef156c5e5d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -160,6 +160,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   int checkMAIVALUHazards(MachineInstr *MI);
   int checkMAILdStHazards(MachineInstr *MI);
   int checkPermlaneHazards(MachineInstr *MI);
+  int checkVALUImmediatelyAfterMFMAHazard(MachineInstr *MI);
 
 public:
   GCNHazardRecognizer(const MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 94de6dd31cad5..81a45ee47c4db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -240,8 +240,8 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_perm_b32 v239, v174, v172, s5
   ; GCN-NEXT:    v_perm_b32 v241, v174, v172, s7
-  ; GCN-NEXT:    v_perm_b32 v243, v175, v173, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_perm_b32 v243, v175, v173, s5
   ; GCN-NEXT:    v_perm_b32 v245, v175, v173, s7
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
@@ -362,8 +362,8 @@
   ; GCN-NEXT:    v_perm_b32 v192, v195, v193, s5
   ; GCN-NEXT:    v_perm_b32 v194, v195, v193, s7
   ; GCN-NEXT:    v_perm_b32 v193, v221, v219, s5
-  ; GCN-NEXT:    v_perm_b32 v195, v221, v219, s7
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
+  ; GCN-NEXT:    v_perm_b32 v195, v221, v219, s7
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
@@ -387,61 +387,61 @@
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v114
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v115
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v116
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v116
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v117
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v118
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v119
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v120
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v121
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v122
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v123
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v124
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v125
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v126
   ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v127
   ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
-  ; GCN-NEXT:    s_nop 6
+  ; GCN-NEXT:    s_nop 3
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v96
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v97
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v98
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v99
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v100
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v100
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v101
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v102
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v103
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v104
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v105
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v106
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v107
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v108
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v109
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v110
   ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v111
   ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
   ; GCN-NEXT:    v_mul_f32_e32 v140, s4, v80
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
   ; GCN-NEXT:    v_max3_f32 v140, v213, v140, v141
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v82
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v142, s4, v83
   ; GCN-NEXT:    v_max3_f32 v140, v140, v141, v142
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v84
@@ -602,23 +602,23 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v121
   ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v125
   ; GCN-NEXT:    v_fma_f32 v139, s4, v96, -v128
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v123, v150
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v127, 0x3fb8aa3b, v127
   ; GCN-NEXT:    v_fma_f32 v143, s4, v101, -v128
   ; GCN-NEXT:    v_fma_f32 v64, s4, v64, -v128
   ; GCN-NEXT:    v_fma_f32 v65, s4, v65, -v128
-  ; GCN-NEXT:    v_fma_f32 v68, s4, v68, -v128
-  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v124, v151
   ; GCN-NEXT:    ds_read_b128 v[130:133], v197
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v124, v151
   ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v68, -v128
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v122
   ; GCN-NEXT:    v_exp_f32_e32 v96, v129
@@ -645,14 +645,14 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_fma_f32 v131, s4, v100, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v96
   ; GCN-NEXT:    v_exp_f32_e32 v100, v129
   ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v131
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v131, v97
-  ; GCN-NEXT:    ;;#ASMSTART
-  ; GCN-NEXT:    s_waitcnt vmcnt(8)
-  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v199, v[188:189]
@@ -687,10 +687,10 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v99
   ; GCN-NEXT:    v_fma_f32 v127, s4, v103, -v128
   ; GCN-NEXT:    v_exp_f32_e32 v103, v150
@@ -710,30 +710,30 @@
   ; GCN-NEXT:    ds_read_b128 v[130:133], v198
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_perm_b32 v127, v144, v142, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
   ; GCN-NEXT:    v_pack_b32_f16 v148, v134, v135
   ; GCN-NEXT:    v_fma_f32 v135, s4, v106, -v128
   ; GCN-NEXT:    v_exp_f32_e32 v105, v125
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v102
+  ; GCN-NEXT:    v_perm_b32 v127, v144, v142, s5
   ; GCN-NEXT:    v_perm_b32 v151, v144, v142, s7
-  ; GCN-NEXT:    v_perm_b32 v153, v145, v143, s5
-  ; GCN-NEXT:    v_perm_b32 v155, v145, v143, s7
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v106, v156
   ; GCN-NEXT:    v_mul_f32_e32 v156, 0x3fb8aa3b, v135
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v103
   ; GCN-NEXT:    v_fma_f32 v136, s4, v107, -v128
+  ; GCN-NEXT:    v_perm_b32 v153, v145, v143, s5
+  ; GCN-NEXT:    v_perm_b32 v155, v145, v143, s7
   ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v134, v135
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
   ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v136
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v134, v135
   ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v139
   ; GCN-NEXT:    v_exp_f32_e32 v107, v138
   ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -763,10 +763,10 @@
   ; GCN-NEXT:    v_fma_f32 v139, s4, v80, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v108
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v129
   ; GCN-NEXT:    ds_read_b128 v[130:133], v197
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v129
   ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -784,9 +784,9 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v111
   ; GCN-NEXT:    v_mul_f32_e32 v156, 0x3fb8aa3b, v137
   ; GCN-NEXT:    v_fma_f32 v137, s4, v83, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v157, 0x3fb8aa3b, v137
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v83, v135
+  ; GCN-NEXT:    v_mul_f32_e32 v157, 0x3fb8aa3b, v137
   ; GCN-NEXT:    v_pack_b32_f16 v145, v136, v134
   ; GCN-NEXT:    ds_read_b128 v[134:137], v197 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -797,13 +797,13 @@
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v199, v[126:127]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v200, v[150:151]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v201, v[152:153]
@@ -813,8 +813,6 @@
   ; GCN-NEXT:    v_fma_f32 v127, s4, v84, -v128
   ; GCN-NEXT:    v_exp_f32_e32 v84, v129
   ; GCN-NEXT:    v_fma_f32 v130, s4, v85, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v80
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v127
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
   ; GCN-NEXT:    v_exp_f32_e32 v85, v125
   ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v130
@@ -822,8 +820,9 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v80
+  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v127
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v81
-  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v127
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
   ; GCN-NEXT:    v_fma_f32 v134, s4, v86, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v158, 0x3fb8aa3b, v134
@@ -836,24 +835,25 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v127
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v82
   ; GCN-NEXT:    v_exp_f32_e32 v86, v156
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v83
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v83
   ; GCN-NEXT:    v_fma_f32 v139, s4, v87, -v128
   ; GCN-NEXT:    v_exp_f32_e32 v87, v157
+  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v139
   ; GCN-NEXT:    v_pack_b32_f16 v127, v127, v138
   ; GCN-NEXT:    v_fma_f32 v138, s4, v89, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    v_perm_b32 v154, v135, v131, s5
   ; GCN-NEXT:    v_perm_b32 v156, v135, v131, s7
   ; GCN-NEXT:    v_fma_f32 v135, s4, v88, -v128
   ; GCN-NEXT:    v_perm_b32 v150, v134, v130, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
   ; GCN-NEXT:    v_perm_b32 v152, v134, v130, s7
   ; GCN-NEXT:    ds_read_b128 v[130:133], v198
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -898,17 +898,17 @@
   ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v130
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v89
   ; GCN-NEXT:    v_fma_f32 v131, s4, v93, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v130, v126, v130
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
   ; GCN-NEXT:    v_exp_f32_e32 v93, v125
+  ; GCN-NEXT:    v_pack_b32_f16 v130, v126, v130
   ; GCN-NEXT:    v_fma_f32 v126, s4, v94, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v125, v90
   ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v126
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v91
   ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v95, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v94, v148
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v95, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v93
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v95, v127
@@ -917,10 +917,10 @@
   ; GCN-NEXT:    v_pack_b32_f16 v131, v125, v126
   ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v125, v129
   ; GCN-NEXT:    ds_read_b128 v[132:135], v197
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v125, v129
   ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -951,10 +951,10 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v199, v[150:151]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v200, v[152:153]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v125
   ; GCN-NEXT:    v_exp_f32_e32 v130, v158
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
@@ -973,14 +973,13 @@
   ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v69
   ; GCN-NEXT:    v_fma_f32 v69, s4, v71, -v128
   ; GCN-NEXT:    v_pack_b32_f16 v140, v132, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v129
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v127
   ; GCN-NEXT:    v_exp_f32_e32 v132, v145
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v129
   ; GCN-NEXT:    v_fma_f32 v65, s4, v70, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
   ; GCN-NEXT:    v_fma_f32 v145, s4, v73, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v145
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v133, v141
   ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v69
@@ -992,12 +991,13 @@
   ; GCN-NEXT:    v_fma_f32 v143, s4, v72, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v130
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v146
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v143, v131
   ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v146
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v143, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v145
   ; GCN-NEXT:    v_pack_b32_f16 v64, v64, v143
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
   ; GCN-NEXT:    v_exp_f32_e32 v73, v144
@@ -1043,10 +1043,10 @@
   ; GCN-NEXT:    v_pack_b32_f16 v135, v66, v64
   ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v146
   ; GCN-NEXT:    ds_read_b128 v[68:71], v197
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v146
   ; GCN-NEXT:    ds_read_b128 v[64:67], v197 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
index 51bcb393dfd4f..8c31a2fc0ff1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa
 ; GCN-NEXT:    v_add_f32_e32 v5, v8, v5
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
-; GCN-NEXT:    v_exp_f32_e32 v5, v5
 ; GCN-NEXT:    s_mov_b32 s0, 0x3fb8aa3b
+; GCN-NEXT:    v_exp_f32_e32 v5, v5
 ; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
 ; GCN-NEXT:    ; iglp_opt mask(0x00000003)
 ; GCN-NEXT:    v_ldexp_f32 v5, v5, v6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0a8d7acd187fc..271a646a9dc8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -69,15 +69,15 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    ; implicit-def: $sgpr3
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
   ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
   ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    ; implicit-def: $sgpr3
   ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
-  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
   ; GCN-NEXT:    ds_read_b128 v[36:39], v51
@@ -133,8 +133,8 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
@@ -148,13 +148,13 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b32 v78, v72
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v17
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
@@ -332,14 +332,18 @@
   ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
   ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v22, v64
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
   ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
   ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
@@ -366,9 +370,6 @@
   ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v68, v2
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
-  ; GCN-NEXT:    ;;#ASMSTART
-  ; GCN-NEXT:    s_waitcnt vmcnt(8)
-  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_read_b128 v[0:3], v84
@@ -389,8 +390,8 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
   ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
   ; GCN-NEXT:    v_exp_f32_e32 v6, v6
@@ -399,7 +400,6 @@
   ; GCN-NEXT:    v_exp_f32_e32 v7, v7
   ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
   ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index a1fe463de1c54..a83b17cfce800 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -3004,17 +3004,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a16, s8
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31]
+; HEURRC-NEXT:    s_nop 11
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; HEURRC-NEXT:    s_waitcnt vmcnt(0)
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v8, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v9, s17
 ; HEURRC-NEXT:    v_mov_b32_e32 v10, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v11, s19
-; HEURRC-NEXT:    s_nop 5
-; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
-; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
-; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
@@ -3405,17 +3405,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a16, s8
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT:    s_nop 11
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; HEURRC-NEXT:    s_waitcnt vmcnt(0)
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v8, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v9, s17
 ; HEURRC-NEXT:    v_mov_b32_e32 v10, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v11, s19
-; HEURRC-NEXT:    s_nop 5
-; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
-; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
-; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index 9436b498205d4..3fd6d6386a17a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -24,10 +24,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    ds_read_b128 v[12:15], v36 offset:48
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
-; GCN-MINREG-NEXT:    v_mov_b32_e32 v32, s1
-; GCN-MINREG-NEXT:    v_add_u32_e32 v35, 0x6000, v36
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v32, s1
+; GCN-MINREG-NEXT:    v_add_u32_e32 v35, 0x6000, v36
 ; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 0
 ; GCN-MINREG-NEXT:    ds_write_b128 v37, v[28:31] offset:112
@@ -152,9 +152,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[12:15], v32 offset:48
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v33, s1, v33
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v33, s1, v33
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v33, v[28:31] offset:112
@@ -175,10 +175,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[0:3], v32 offset:8192
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v33, s1
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v33, s1
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v33, v[24:27] offset:8288
@@ -222,10 +222,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[0:3], v32 offset:49152
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v32, 0x6000, v32
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v32, 0x6000, v32
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v33, v[24:27] offset:24672
@@ -281,9 +281,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    ds_read_b128 v[28:31], v35 offset:112
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GCN-ILP-NEXT:    v_add_u32_e32 v32, s1, v32
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT:    v_add_u32_e32 v32, s1, v32
 ; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v32, v[28:31] offset:112
@@ -494,10 +494,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_read_b128 v[0:3], v35 offset:8192
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
-; GCN-MINREG-NEXT:    v_mov_b32_e32 v34, s1
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v34, s1
 ; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v34, v[24:27] offset:8288
@@ -520,9 +520,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_read_b128 v[12:15], v35 offset:24624
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
-; GCN-MINREG-NEXT:    v_add_u32_e32 v36, 0x6000, v35
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT:    v_add_u32_e32 v36, 0x6000, v35
 ; GCN-MINREG-NEXT:    s_nop 15
 ; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v34, v[28:31] offset:16496
@@ -601,9 +601,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[12:15], v32 offset:48
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v35, s1, v35
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v35, s1, v35
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v35, v[28:31] offset:112
@@ -624,10 +624,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[0:3], v32 offset:8192
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v35, s1
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v35, s1
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v35, v[24:27] offset:8288
@@ -672,10 +672,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC-NEXT:    ds_read_b128 v[0:3], v32 offset:49152
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v32, 0x6000, v32
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v32, 0x6000, v32
 ; GCN-MAXOCC-NEXT:    s_nop 15
 ; GCN-MAXOCC-NEXT:    s_nop 1
 ; GCN-MAXOCC-NEXT:    ds_write_b128 v35, v[28:31] offset:24688
@@ -731,9 +731,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ds_read_b128 v[28:31], v35 offset:112
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
-; GCN-ILP-NEXT:    v_add_u32_e32 v34, s1, v34
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT:    v_add_u32_e32 v34, s1, v34
 ; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v34, v[0:3]
@@ -754,10 +754,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ds_read_b128 v[28:31], v35 offset:8304
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
-; GCN-ILP-NEXT:    v_mov_b32_e32 v34, s1
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT:    v_mov_b32_e32 v34, s1
 ; GCN-ILP-NEXT:    s_nop 15
 ; GCN-ILP-NEXT:    s_nop 1
 ; GCN-ILP-NEXT:    ds_write_b128 v34, v[24:27] offset:8288
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 996c76488080f..88beb4c3f5105 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -670,8 +670,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; GCN-NEXT:    ds_read_b128 a[140:143], v4 offset:57392
 ; GCN-NEXT:    s_waitcnt lgkmcnt(14)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(8)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
@@ -776,8 +776,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v4 offset:57392
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(8)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
@@ -885,9 +885,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
@@ -908,10 +908,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
@@ -955,10 +955,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; GCN-NEXT:    s_nop 15
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
@@ -1014,9 +1014,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:112
@@ -1037,10 +1037,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
 ; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
@@ -1084,10 +1084,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; EXACTCUTOFF-NEXT:    s_nop 15
 ; EXACTCUTOFF-NEXT:    s_nop 1
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
@@ -1227,10 +1227,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
 ; GCN-NEXT:    s_waitcnt lgkmcnt(1)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63]
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
 ; GCN-NEXT:    v_add_f32_e32 v4, v12, v10
 ; GCN-NEXT:    v_exp_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v10, v11
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
 ; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
 ; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
 ; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
@@ -1246,13 +1246,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; GCN-NEXT:    v_rndne_f32_e32 v11, v10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
+; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
 ; GCN-NEXT:    v_fma_f32 v4, s2, v3, -v10
 ; GCN-NEXT:    v_sub_f32_e32 v12, v10, v11
 ; GCN-NEXT:    v_fmac_f32_e32 v4, s2, v7
 ; GCN-NEXT:    v_add_f32_e32 v4, v12, v4
 ; GCN-NEXT:    v_exp_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v10, v11
-; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
 ; GCN-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
 ; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
 ; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
@@ -1279,6 +1279,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0x54
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95]
+; GCN-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
 ; GCN-NEXT:    v_sub_f32_e32 v1, v4, v10
 ; GCN-NEXT:    v_fma_f32 v4, s3, v3, -v4
 ; GCN-NEXT:    v_fmac_f32_e32 v4, s3, v7
@@ -1286,8 +1287,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; GCN-NEXT:    v_exp_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v10
 ; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
-; GCN-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
 ; GCN-NEXT:    ds_read_b128 a[152:155], v2 offset:57440
+; GCN-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
 ; GCN-NEXT:    v_ldexp_f32 v1, v1, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
@@ -1295,14 +1296,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; GCN-NEXT:    v_mul_f32_e32 v4, s8, v3
 ; GCN-NEXT:    v_fma_f32 v3, s8, v3, -v4
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127]
+; GCN-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
 ; GCN-NEXT:    v_rndne_f32_e32 v1, v4
 ; GCN-NEXT:    v_sub_f32_e32 v10, v4, v1
 ; GCN-NEXT:    v_fmac_f32_e32 v3, s8, v7
 ; GCN-NEXT:    v_add_f32_e32 v3, v10, v3
 ; GCN-NEXT:    v_exp_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
-; GCN-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
 ; GCN-NEXT:    ds_read_b128 a[128:131], v2 offset:57344
 ; GCN-NEXT:    ds_read_b128 a[132:135], v2 offset:57360
 ; GCN-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
@@ -1412,10 +1412,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
 ; EXACTCUTOFF-NEXT:    v_add_f32_e32 v4, v12, v10
 ; EXACTCUTOFF-NEXT:    v_exp_f32_e32 v4, v4
 ; EXACTCUTOFF-NEXT:    v_cvt_i32_f32_e32 v10, v11
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
@@ -1431,13 +1431,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; EXACTCUTOFF-NEXT:    v_rndne_f32_e32 v11, v10
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
 ; EXACTCUTOFF-NEXT:    v_fma_f32 v4, s2, v3, -v10
 ; EXACTCUTOFF-NEXT:    v_sub_f32_e32 v12, v10, v11
 ; EXACTCUTOFF-NEXT:    v_fmac_f32_e32 v4, s2, v7
 ; EXACTCUTOFF-NEXT:    v_add_f32_e32 v4, v12, v4
 ; EXACTCUTOFF-NEXT:    v_exp_f32_e32 v4, v4
 ; EXACTCUTOFF-NEXT:    v_cvt_i32_f32_e32 v10, v11
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
@@ -1464,6 +1464,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; EXACTCUTOFF-NEXT:    s_load_dword s8, s[4:5], 0x54
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
 ; EXACTCUTOFF-NEXT:    v_sub_f32_e32 v1, v4, v10
 ; EXACTCUTOFF-NEXT:    v_fma_f32 v4, s3, v3, -v4
 ; EXACTCUTOFF-NEXT:    v_fmac_f32_e32 v4, s3, v7
@@ -1471,8 +1472,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; EXACTCUTOFF-NEXT:    v_exp_f32_e32 v1, v1
 ; EXACTCUTOFF-NEXT:    v_cvt_i32_f32_e32 v4, v10
 ; EXACTCUTOFF-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[152:155], v2 offset:57440
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
 ; EXACTCUTOFF-NEXT:    v_ldexp_f32 v1, v1, v4
 ; EXACTCUTOFF-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; EXACTCUTOFF-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v6
@@ -1480,14 +1481,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
 ; EXACTCUTOFF-NEXT:    v_mul_f32_e32 v4, s8, v3
 ; EXACTCUTOFF-NEXT:    v_fma_f32 v3, s8, v3, -v4
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v9, v1, a[96:127]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
 ; EXACTCUTOFF-NEXT:    v_rndne_f32_e32 v1, v4
 ; EXACTCUTOFF-NEXT:    v_sub_f32_e32 v10, v4, v1
 ; EXACTCUTOFF-NEXT:    v_fmac_f32_e32 v3, s8, v7
 ; EXACTCUTOFF-NEXT:    v_add_f32_e32 v3, v10, v3
 ; EXACTCUTOFF-NEXT:    v_exp_f32_e32 v3, v3
 ; EXACTCUTOFF-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[128:131], v2 offset:57344
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[132:135], v2 offset:57360
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 47ebd072c4cc7..4ad397d44ffd3 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -384,15 +384,15 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
-; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
+; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
-; CHECK-NEXT:    s_nop 3
+; CHECK-NEXT:    s_nop 2
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v24, v4
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_mov_b32_e32 v8, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v8
@@ -715,9 +715,10 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
-; CHECK-NEXT:    s_nop 8
+; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    s_nop 7
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]

>From 02678630d2bf7d1259aecef66dc0119b8a3f0880 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56 at users.noreply.github.com>
Date: Mon, 2 Mar 2026 15:57:49 +0530
Subject: [PATCH 2/2] Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 993bd65a08daa..04ad98df93258 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2477,22 +2477,20 @@ int GCNHazardRecognizer::checkVALUImmediatelyAfterMFMAHazard(MachineInstr *MI) {
 
   assert(SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMFMA(*MI));
 
-  // Find the most recently emitted instruction. Count leading nullptrs (stall
-  // cycles); one or more means we've had at least one cycle of separation.
-  int LeadingNullptrs = 0;
-  for (MachineInstr *EmittedMI : EmittedInstrs) {
-    if (!EmittedMI) {
-      ++LeadingNullptrs;
-      continue;
-    }
-    // Found the most recent instruction.
-    if (LeadingNullptrs >= 1)
-      return 0; // At least one stall cycle since MFMA, separation achieved
-    if (SIInstrInfo::isMFMA(*EmittedMI))
-      return 1; // MFMA is most recent with no intervening cycle, block VALU
-    return 0;   // Most recent is not MFMA
-  }
-  return 0; // Empty, no preceding MFMA
+  // The hazard exists only when a VALU is immediately after an MFMA with
+  // no intervening stall cycle (nullptr entry). Therefore, only the first
+  // entry in EmittedInstrs matters.
+  if (EmittedInstrs.empty())
+    return 0; // No preceding instructions.
+
+  MachineInstr *Last = EmittedInstrs.front();
+  if (!Last)
+    return 0; // At least one stall cycle since any prior instruction.
+
+  if (SIInstrInfo::isMFMA(*Last))
+    return 1; // MFMA is most recent with no intervening cycle, block VALU.
+
+  return 0; // Most recent instruction is not an MFMA.
 }
 
 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {



More information about the llvm-commits mailing list