[llvm-branch-commits] [llvm] 7f6577c - Revert "[AMDGPU] Widen MUBUF/MTBUF source-vgpr WAR hazard on gfx940-family to…"

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sat Jun 13 08:24:52 PDT 2026


Author: theRonShark
Date: 2026-06-13T11:24:47-04:00
New Revision: 7f6577cc1acb0ab80696e2314dc69a0f7242cf25

URL: https://github.com/llvm/llvm-project/commit/7f6577cc1acb0ab80696e2314dc69a0f7242cf25
DIFF: https://github.com/llvm/llvm-project/commit/7f6577cc1acb0ab80696e2314dc69a0f7242cf25.diff

LOG: Revert "[AMDGPU] Widen MUBUF/MTBUF source-vgpr WAR hazard on gfx940-family to…"

This reverts commit 62b7cf9623fc310525f39ed69aaecc318a909731.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Removed: 
    llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 25a7ca7d957b0..91329473fca6f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -895,18 +895,15 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
     // (like wbinvl1)
     if (VDataIdx == -1)
       return -1;
-    if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
-      // On gfx940-family the BUFFER_STORE source-vgpr WAR hazard exists for
-      // every SOFFSET shape; the wait-state count differs by SOFFSET, and is
-      // computed in checkVALUHazardsHelper. Pre-gfx940 the hazard only exists
-      // if soffset is not an SGPR.
-      if (ST.hasGFX940Insts())
-        return VDataIdx;
-      const MachineOperand *SOffset =
-          TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
-      if (!SOffset || !SOffset->isReg())
-        return VDataIdx;
-    }
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+        (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
   }
 
   // MIMG instructions create a hazard if they don't use a 256-bit T# and
@@ -934,59 +931,25 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
 
 int GCNHazardRecognizer::checkVALUHazardsHelper(
     const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
-  // Helper to check for the hazard where VMEM instructions that store more
-  // than 8 bytes can have their store data overwritten by the next
-  // instruction. On gfx940-family the window depends on the producer's
-  // SOFFSET shape:
-  //   - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
-  //   - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
-  //     store: 2 wait states.
-  // Pre-gfx940 keeps a single 1-wait-state window. The 1-cycle sgpr-SOFFSET
-  // window was measured on gfx950 (MI350X); the same gate is applied to the
-  // rest of the gfx940 family to match the existing rule's granularity.
+  // Helper to check for the hazard where VMEM instructions that store more than
+  // 8 bytes can have there store data over written by the next instruction.
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  const SIInstrInfo *TII = ST.getInstrInfo();
 
+  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
   int WaitStatesNeeded = 0;
+
   if (!TRI->isVectorRegister(MRI, Def.getReg()))
     return WaitStatesNeeded;
-  const Register Reg = Def.getReg();
-
-  const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
-
-  // Per-producer required wait-state window. On pre-gfx940 every producer
-  // uses 1; on gfx940-family MUBUF/MTBUF stores with an SGPR SOFFSET use 1
-  // and everything else (literal/absent SOFFSET, FLAT) uses 2.
-  auto WindowFor = [this, TII](const MachineInstr &MI) -> int {
-    if (!ST.hasGFX940Insts())
-      return 1;
-    if (TII->isBUF(MI)) {
-      const MachineOperand *SOffset =
-          TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
-      if (SOffset && SOffset->isReg())
-        return 1;
-    }
-    return 2;
-  };
-
-  // For each hazard producer reached, accumulate the wait states still
-  // needed using that producer's own window. The predicate always returns
-  // false so the walk runs to MaxWaitStates.
-  int Distance = 0;
-  auto Counter = [&](const MachineInstr &MI) {
+  Register Reg = Def.getReg();
+  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
     int DataIdx = createsVALUHazard(MI);
-    if (DataIdx >= 0 &&
-        TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg)) {
-      int Need = WindowFor(MI) - Distance;
-      WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
-    }
-    // Mirror getWaitStatesSince's accounting, which does not count inline asm
-    // towards the wait-state distance.
-    if (!MI.isInlineAsm())
-      Distance += SIInstrInfo::getNumWaitStates(MI);
-    return false;
+    return DataIdx >= 0 &&
+           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
   };
-  getWaitStatesSince(Counter, MaxWaitStates);
+
+  int WaitStatesNeededForDef =
+    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
+  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
 
   return WaitStatesNeeded;
 }

diff --git a/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir b/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir
deleted file mode 100644
index db8b47f5cab5f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir
+++ /dev/null
@@ -1,122 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX950 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX942 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX900 %s
-
-# Tests for the BUFFER_STORE source-vgpr WAR hazard on gfx940-family
-# subtargets (gfx942, gfx950).
-#
-# A buffer_store_dwordx4 (or any MUBUF store wider than 8 bytes) immediately
-# followed by a VALU instruction that overwrites two of the source vgprs
-# (typically v_pk_mul_f32 / v_pk_add_f32 writing v[X:X+1]) loses the bytes
-# of one dword to the post-write value on gfx940 family. The required
-# wait-state window depends on the producer's SOFFSET shape:
-#   - SGPR-sourced SOFFSET: 1 wait state  -> S_NOP 0
-#   - literal/absent SOFFSET: 2 wait states -> S_NOP 1
-# Empirical measurement on gfx950 (MI350X); the same rule is applied to
-# the rest of the gfx940 family to match the existing recognizer's
-# granularity.
-#
-# Pre-gfx940 the hazard existed only with literal/absent SOFFSET; the
-# pre-existing rule keeps its narrow gating for those targets.
-
----
-name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX950-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
-    ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX950-NEXT: {{  $}}
-    ; GFX950-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    ; GFX950-NEXT: S_NOP 0
-    ; GFX950-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX950-NEXT: S_ENDPGM 0
-    ;
-    ; GFX942-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
-    ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX942-NEXT: {{  $}}
-    ; GFX942-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    ; GFX942-NEXT: S_NOP 0
-    ; GFX942-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942-NEXT: S_ENDPGM 0
-    ;
-    ; GFX900-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
-    ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX900-NEXT: {{  $}}
-    ; GFX900-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    ; GFX900-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX900-NEXT: S_ENDPGM 0
-    BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX950-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
-    ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX950-NEXT: {{  $}}
-    ; GFX950-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
-    ; GFX950-NEXT: S_NOP 1
-    ; GFX950-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX950-NEXT: S_ENDPGM 0
-    ;
-    ; GFX942-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
-    ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX942-NEXT: {{  $}}
-    ; GFX942-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
-    ; GFX942-NEXT: S_NOP 1
-    ; GFX942-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942-NEXT: S_ENDPGM 0
-    ;
-    ; GFX900-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
-    ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GFX900-NEXT: {{  $}}
-    ; GFX900-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
-    ; GFX900-NEXT: S_NOP 0
-    ; GFX900-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX900-NEXT: S_ENDPGM 0
-    BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx2_sgpr_soffset_then_vpk_mul_no_hazard
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-LABEL: name: buffer_store_dwordx2_sgpr_soffset_then_vpk_mul_no_hazard
-    ; GCN: liveins: $vgpr0_vgpr1, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr0_vgpr1, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0
-    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr0_vgpr1, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx4_sgpr_soffset_then_unrelated_write
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_unrelated_write
-    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr10_vgpr11 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0
-    BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
-    $vgpr10_vgpr11 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    S_ENDPGM 0
-...


        


More information about the llvm-branch-commits mailing list