[llvm-branch-commits] [llvm] 7f6577c - Revert "[AMDGPU] Widen MUBUF/MTBUF source-vgpr WAR hazard on gfx940-family to…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Jun 13 08:24:52 PDT 2026
Author: theRonShark
Date: 2026-06-13T11:24:47-04:00
New Revision: 7f6577cc1acb0ab80696e2314dc69a0f7242cf25
URL: https://github.com/llvm/llvm-project/commit/7f6577cc1acb0ab80696e2314dc69a0f7242cf25
DIFF: https://github.com/llvm/llvm-project/commit/7f6577cc1acb0ab80696e2314dc69a0f7242cf25.diff
LOG: Revert "[AMDGPU] Widen MUBUF/MTBUF source-vgpr WAR hazard on gfx940-family to…"
This reverts commit 62b7cf9623fc310525f39ed69aaecc318a909731.
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Removed:
llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 25a7ca7d957b0..91329473fca6f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -895,18 +895,15 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
// (like wbinvl1)
if (VDataIdx == -1)
return -1;
- if (AMDGPU::getRegBitWidth(VDataRCID) > 64) {
- // On gfx940-family the BUFFER_STORE source-vgpr WAR hazard exists for
- // every SOFFSET shape; the wait-state count differs by SOFFSET, and is
- // computed in checkVALUHazardsHelper. Pre-gfx940 the hazard only exists
- // if soffset is not an SGPR.
- if (ST.hasGFX940Insts())
- return VDataIdx;
- const MachineOperand *SOffset =
- TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
- if (!SOffset || !SOffset->isReg())
- return VDataIdx;
- }
+ // For MUBUF/MTBUF instructions this hazard only exists if the
+ // instruction is not using a register in the soffset field.
+ const MachineOperand *SOffset =
+ TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ // If we have no soffset operand, then assume this field has been
+ // hardcoded to zero.
+ if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+ (!SOffset || !SOffset->isReg()))
+ return VDataIdx;
}
// MIMG instructions create a hazard if they don't use a 256-bit T# and
@@ -934,59 +931,25 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
int GCNHazardRecognizer::checkVALUHazardsHelper(
const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
- // Helper to check for the hazard where VMEM instructions that store more
- // than 8 bytes can have their store data overwritten by the next
- // instruction. On gfx940-family the window depends on the producer's
- // SOFFSET shape:
- // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
- // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
- // store: 2 wait states.
- // Pre-gfx940 keeps a single 1-wait-state window. The 1-cycle sgpr-SOFFSET
- // window was measured on gfx950 (MI350X); the same gate is applied to the
- // rest of the gfx940 family to match the existing rule's granularity.
+ // Helper to check for the hazard where VMEM instructions that store more than
+ // 8 bytes can have there store data over written by the next instruction.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
int WaitStatesNeeded = 0;
+
if (!TRI->isVectorRegister(MRI, Def.getReg()))
return WaitStatesNeeded;
- const Register Reg = Def.getReg();
-
- const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
-
- // Per-producer required wait-state window. On pre-gfx940 every producer
- // uses 1; on gfx940-family MUBUF/MTBUF stores with an SGPR SOFFSET use 1
- // and everything else (literal/absent SOFFSET, FLAT) uses 2.
- auto WindowFor = [this, TII](const MachineInstr &MI) -> int {
- if (!ST.hasGFX940Insts())
- return 1;
- if (TII->isBUF(MI)) {
- const MachineOperand *SOffset =
- TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
- if (SOffset && SOffset->isReg())
- return 1;
- }
- return 2;
- };
-
- // For each hazard producer reached, accumulate the wait states still
- // needed using that producer's own window. The predicate always returns
- // false so the walk runs to MaxWaitStates.
- int Distance = 0;
- auto Counter = [&](const MachineInstr &MI) {
+ Register Reg = Def.getReg();
+ auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
int DataIdx = createsVALUHazard(MI);
- if (DataIdx >= 0 &&
- TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg)) {
- int Need = WindowFor(MI) - Distance;
- WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
- }
- // Mirror getWaitStatesSince's accounting, which does not count inline asm
- // towards the wait-state distance.
- if (!MI.isInlineAsm())
- Distance += SIInstrInfo::getNumWaitStates(MI);
- return false;
+ return DataIdx >= 0 &&
+ TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
};
- getWaitStatesSince(Counter, MaxWaitStates);
+
+ int WaitStatesNeededForDef =
+ VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
return WaitStatesNeeded;
}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir b/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir
deleted file mode 100644
index db8b47f5cab5f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/buffer-store-dwordx4-vpk-mul-war-hazard-gfx942.mir
+++ /dev/null
@@ -1,122 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX950 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX942 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX900 %s
-
-# Tests for the BUFFER_STORE source-vgpr WAR hazard on gfx940-family
-# subtargets (gfx942, gfx950).
-#
-# A buffer_store_dwordx4 (or any MUBUF store wider than 8 bytes) immediately
-# followed by a VALU instruction that overwrites two of the source vgprs
-# (typically v_pk_mul_f32 / v_pk_add_f32 writing v[X:X+1]) loses the bytes
-# of one dword to the post-write value on gfx940 family. The required
-# wait-state window depends on the producer's SOFFSET shape:
-# - SGPR-sourced SOFFSET: 1 wait state -> S_NOP 0
-# - literal/absent SOFFSET: 2 wait states -> S_NOP 1
-# Empirical measurement on gfx950 (MI350X); the same rule is applied to
-# the rest of the gfx940 family to match the existing recognizer's
-# granularity.
-#
-# Pre-gfx940 the hazard existed only with literal/absent SOFFSET; the
-# pre-existing rule keeps its narrow gating for those targets.
-
----
-name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX950-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
- ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX950-NEXT: {{ $}}
- ; GFX950-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- ; GFX950-NEXT: S_NOP 0
- ; GFX950-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX950-NEXT: S_ENDPGM 0
- ;
- ; GFX942-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
- ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- ; GFX942-NEXT: S_NOP 0
- ; GFX942-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX942-NEXT: S_ENDPGM 0
- ;
- ; GFX900-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_vpk_mul
- ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- ; GFX900-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX900-NEXT: S_ENDPGM 0
- BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX950-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
- ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX950-NEXT: {{ $}}
- ; GFX950-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
- ; GFX950-NEXT: S_NOP 1
- ; GFX950-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX950-NEXT: S_ENDPGM 0
- ;
- ; GFX942-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
- ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX942-NEXT: {{ $}}
- ; GFX942-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
- ; GFX942-NEXT: S_NOP 1
- ; GFX942-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX942-NEXT: S_ENDPGM 0
- ;
- ; GFX900-LABEL: name: buffer_store_dwordx4_literal_soffset_then_vpk_mul
- ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GFX900-NEXT: {{ $}}
- ; GFX900-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
- ; GFX900-NEXT: S_NOP 0
- ; GFX900-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX900-NEXT: S_ENDPGM 0
- BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec
- $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx2_sgpr_soffset_then_vpk_mul_no_hazard
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-LABEL: name: buffer_store_dwordx2_sgpr_soffset_then_vpk_mul_no_hazard
- ; GCN: liveins: $vgpr0_vgpr1, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr0_vgpr1, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- ; GCN-NEXT: $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
- BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr0_vgpr1, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
-...
-
----
-name: buffer_store_dwordx4_sgpr_soffset_then_unrelated_write
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-LABEL: name: buffer_store_dwordx4_sgpr_soffset_then_unrelated_write
- ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr8_vgpr9, $sgpr0_sgpr1, $sgpr1, $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- ; GCN-NEXT: $vgpr10_vgpr11 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
- BUFFER_STORE_DWORDX4_OFFEN_exact $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr1, 0, 0, 0, implicit $exec
- $vgpr10_vgpr11 = nofpexcept V_PK_MUL_F32 0, $sgpr0_sgpr1, 8, $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
-...
More information about the llvm-branch-commits mailing list