[llvm] 1813603 - [AMDGPU] Fix GFX1250 hazard: S_SET_VGPR_MSB dropped (#184904)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 06:35:52 PDT 2026
Author: Yaxun (Sam) Liu
Date: 2026-03-10T09:35:45-04:00
New Revision: 1813603933d19111e5310c5302b76be4de06dcab
URL: https://github.com/llvm/llvm-project/commit/1813603933d19111e5310c5302b76be4de06dcab
DIFF: https://github.com/llvm/llvm-project/commit/1813603933d19111e5310c5302b76be4de06dcab.diff
LOG: [AMDGPU] Fix GFX1250 hazard: S_SET_VGPR_MSB dropped (#184904)
[AMDGPU] Fix GFX1250 hazard: S_SET_VGPR_MSB dropped after
S_SETREG_IMM32_B32 (MODE)
On GFX1250, S_SET_VGPR_MSB immediately after S_SETREG_IMM32_B32
targeting
the MODE register is silently dropped by hardware.
AMDGPULowerVGPREncoding may insert S_SET_VGPR_MSB after a setreg(MODE)
in
Case 2 (size > 12) when imm32[12:19] doesn't match current VGPR MSBs, or
when the next VALU instruction needs different MSBs. Fix by inserting
S_NOP
between the setreg and S_SET_VGPR_MSB to prevent the hazard.
The fix handles two scenarios:
- Case 2 mismatch: S_NOP is inserted directly before S_SET_VGPR_MSB in
handleSetregMode.
- Case 2 match followed by a VALU with different MSBs: a flag
(NeedNopBeforeSetVGPRMSB) is set, and setMode inserts S_NOP before the
next S_SET_VGPR_MSB.
Also adds vcmpx-permlane-vgpr-msb-gfx1250.mir to verify that VGPR
lowering
must run after the hazard recognizer: fixVcmpxPermlaneHazards creates
V_MOV_B32 using high VGPRs that need correct S_SET_VGPR_MSB from the
lowering pass.
Added:
llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir
llvm/test/CodeGen/AMDGPU/vcmpx-permlane-vgpr-msb-gfx1250.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index cbbc90a0f25b0..f18cc5f59ac07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -155,6 +155,11 @@ class AMDGPULowerVGPREncoding {
/// Last hard clause instruction.
MachineInstr *Clause;
+ /// S_SET_VGPR_MSB immediately after S_SETREG_IMM32_B32 targeting MODE is
+ /// silently dropped on GFX1250. When set, the next S_SET_VGPR_MSB insertion
+ /// must be preceded by S_NOP to avoid the hazard.
+ bool NeedNopBeforeSetVGPRMSB;
+
/// Insert mode change before \p I. \returns true if mode was changed.
bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I);
@@ -251,6 +256,14 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
I = handleClause(I);
I = handleCoissue(I);
+ // Case 2 match in handleSetregMode: the setreg's imm[12:19] matched
+ // current MSBs, but the next VALU needs different MSBs, so this
+ // S_SET_VGPR_MSB would land right after the setreg. Insert S_NOP to
+ // prevent it from being silently dropped.
+ if (NeedNopBeforeSetVGPRMSB) {
+ BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_NOP)).addImm(0);
+ NeedNopBeforeSetVGPRMSB = false;
+ }
MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
.addImm(NewMode.encode() | OldModeBits);
LLVM_DEBUG(dbgs() << " -> inserted new S_SET_VGPR_MSB: "
@@ -508,14 +521,18 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
// via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
// a new s_set_vgpr_msb will be inserted after this instruction.
MostRecentModeSet = nullptr;
+ NeedNopBeforeSetVGPRMSB = true;
LLVM_DEBUG(dbgs() << " -> bits[12:19] already correct, "
"invalidated MostRecentModeSet\n");
return false;
}
// imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
- // the original instruction to restore the correct value.
+ // the original instruction to restore the correct value. Insert S_NOP
+ // to avoid the GFX1250 hazard where S_SET_VGPR_MSB immediately after
+ // S_SETREG_IMM32_B32(MODE) is silently dropped.
MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
+ BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
TII->get(AMDGPU::S_SET_VGPR_MSB))
.addImm(ModeValue);
@@ -540,6 +557,7 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
CurrentMode = {};
for (auto &MBB : MF) {
MostRecentModeSet = nullptr;
+ NeedNopBeforeSetVGPRMSB = false;
this->MBB = &MBB;
LLVM_DEBUG(dbgs() << "BB#" << MBB.getNumber() << ' ' << MBB.getName()
@@ -556,6 +574,7 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
CurrentMode = {};
else
resetMode(MI.getIterator());
+ NeedNopBeforeSetVGPRMSB = false;
continue;
}
@@ -563,6 +582,7 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << " inline asm: " << MI);
if (TII->hasVGPRUses(MI))
resetMode(MI.getIterator());
+ NeedNopBeforeSetVGPRMSB = false;
continue;
}
@@ -584,6 +604,7 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
}
Changed |= runOnMachineInstr(MI);
+ NeedNopBeforeSetVGPRMSB = false;
if (ClauseRemaining)
--ClauseRemaining;
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir
new file mode 100644
index 0000000000000..95c8c67566f51
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir
@@ -0,0 +1,119 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=amdgpu-lower-vgpr-encoding -o - %s | FileCheck %s
+
+# Test handling of the GFX1250 hardware hazard where S_SET_VGPR_MSB immediately
+# after S_SETREG_IMM32_B32 (MODE) is silently dropped.
+#
+# AMDGPULowerVGPREncoding may place S_SET_VGPR_MSB after S_SETREG_IMM32_B32
+# (MODE) in Case 2 (size > 12). It inserts S_NOPs between them to prevent
+# the hazard.
+
+---
+# Case 2 mismatch: setreg (size=16) with imm32[12:19] that doesn't match
+# current VGPR MSB. AMDGPULowerVGPREncoding inserts S_NOP + S_SET_VGPR_MSB
+# after the setreg.
+name: setreg_mode_size_gt_12_mismatch
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: setreg_mode_size_gt_12_mismatch
+ ; CHECK: S_SET_VGPR_MSB 64, implicit-def $mode
+ ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_SET_VGPR_MSB 64, implicit-def $mode
+ ; CHECK-NEXT: S_ENDPGM 0
+ $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+ ; imm32 = 0x23ABC = 146108 (bits 12:19 = 0x23, doesn't match VGPR MSB mode)
+ S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ S_ENDPGM 0
+...
+
+---
+# Case 2 with different next MSB: setreg (size=16) with imm32[12:19] that
+# doesn't match current VGPR MSB. S_NOP + S_SET_VGPR_MSB is inserted to
+# restore current mode, then another S_SET_VGPR_MSB for the next VALU
+# (v512/v513).
+name: setreg_mode_size_gt_12_matches_next
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: setreg_mode_size_gt_12_matches_next
+ ; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
+ ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $vgpr257, implicit $exec
+ ; CHECK-NEXT: S_SETREG_IMM32_B32 43708, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_SET_VGPR_MSB 65, implicit-def $mode
+ ; CHECK-NEXT: S_SET_VGPR_MSB 16770, implicit-def $mode
+ ; CHECK-NEXT: $vgpr512 = V_MOV_B32_e32 undef $vgpr513, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr257, implicit $exec
+ ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+ ; imm32 = 0xAABC = 43708 (bits 12:19 = 0xA = 10, matches next MSB for v512/v513)
+ S_SETREG_IMM32_B32 43708, 30721, implicit-def $mode, implicit $mode
+ $vgpr512 = V_MOV_B32_e32 undef $vgpr513, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+# No hazard: S_SETREG_IMM32_B32 targeting non-MODE register.
+name: setreg_non_mode_no_hazard
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: setreg_non_mode_no_hazard
+ ; CHECK: S_SET_VGPR_MSB 64, implicit-def $mode
+ ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; CHECK-NEXT: S_SETREG_IMM32_B32 0, 2178, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_SET_VGPR_MSB 16384, implicit-def $mode
+ ; CHECK-NEXT: $vgpr0 = V_ADD_F32_e32 undef $vgpr1, undef $vgpr2, implicit $exec, implicit $mode
+ ; CHECK-NEXT: S_ENDPGM 0
+ $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; hwreg(STATUS, 2, 2): simm16 = 2 | (2 << 6) | (1 << 11) = 0x882 = 2178
+ S_SETREG_IMM32_B32 0, 2178, implicit-def $mode, implicit $mode
+ $vgpr0 = V_ADD_F32_e32 undef $vgpr1, undef $vgpr2, implicit $exec, implicit $mode
+ S_ENDPGM 0
+...
+
+---
+# Case 2 but no high VGPRs before setreg. The lowering pass still inserts
+# S_NOP + S_SET_VGPR_MSB 0 (redundant but safe).
+name: setreg_mode_size_gt_12_no_high_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: setreg_mode_size_gt_12_no_high_vgpr
+ ; CHECK: $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_SET_VGPR_MSB 0, implicit-def $mode
+ ; CHECK-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+ S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ S_ENDPGM 0
+...
+
+---
+# Case 2 with high VGPR only AFTER setreg: setreg (size=16) with low VGPRs
+# before but high VGPRs after. AMDGPULowerVGPREncoding inserts S_NOP +
+# S_SET_VGPR_MSB before the next VALU (which is right after setreg).
+name: setreg_mode_size_gt_12_high_vgpr_after
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: setreg_mode_size_gt_12_high_vgpr_after
+ ; CHECK: $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: S_SET_VGPR_MSB 0, implicit-def $mode
+ ; CHECK-NEXT: S_SET_VGPR_MSB 64, implicit-def $mode
+ ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+ S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-vgpr-msb-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-vgpr-msb-gfx1250.mir
new file mode 100644
index 0000000000000..8be4c6449d596
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-vgpr-msb-gfx1250.mir
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# Test: fixVcmpxPermlaneHazards + S_SET_VGPR_MSB interaction on GFX1250
+#
+# Proves that VGPR lowering MUST run AFTER the hazard recognizer.
+#
+# fixVcmpxPermlaneHazards creates V_MOV_B32 Reg, Reg using permlane's src0.
+# When src0 is a high VGPR (>= v256) but vdst is a low VGPR, the V_MOV_B32
+# needs {dst=1, src0=1} MSB mode, while the permlane needs {dst=0, src0=1}.
+#
+# Current order (hazard-rec then vgpr-lowering): VGPR lowering sees the
+# V_MOV_B32 and inserts S_SET_VGPR_MSB with correct {dst=1, src0=1}.
+#
+# Reversed order (vgpr-lowering then hazard-rec): The V_MOV_B32 inherits the
+# permlane's {dst=0, src0=1}, causing it to write to v44 instead of v300.
+
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=post-RA-hazard-rec,amdgpu-lower-vgpr-encoding -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=post-RA-hazard-rec -o - %s | FileCheck -check-prefix=ASM %s
+
+# vcmpx + permlane with high VGPR src0 ($vgpr300) and low VGPR vdst ($vgpr0).
+# fixVcmpxPermlaneHazards inserts V_MOV_B32 $vgpr300, $vgpr300 before permlane.
+#
+# With correct order (hazard-rec then vgpr-lowering), VGPR lowering sees the
+# V_MOV_B32 and inserts S_SET_VGPR_MSB 65 {dst=1, src0=1} before it,
+# then S_SET_VGPR_MSB {dst=0, src0=1} before the permlane.
+#
+# If the order were reversed (vgpr-lowering then hazard-rec), VGPR lowering
+# would only see the permlane and insert S_SET_VGPR_MSB 1 {dst=0, src0=1}.
+# The V_MOV_B32 inserted later by the hazard recognizer would inherit dst=0,
+# writing to v44 (300 & 0xFF) instead of v300.
+#
+# ASM-LABEL: vcmpx_permlane_high_vgpr_src0:
+# ASM: v_cmpx_le_f32_e32 0, v0
+# ASM: s_set_vgpr_msb 0x41
+# ASM-NEXT: v_mov_b32_e32 v44 /*v300*/, v44 /*v300*/
+# ASM-NEXT: s_set_vgpr_msb 0x4101
+# ASM: v_permlane16_b32 v0, v44 /*v300*/, s1, s0
+---
+name: vcmpx_permlane_high_vgpr_src0
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: vcmpx_permlane_high_vgpr_src0
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr300, $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SET_VGPR_MSB 65, implicit-def $mode
+ ; CHECK-NEXT: $vgpr300 = V_MOV_B32_e32 killed $vgpr300, implicit $exec
+ ; CHECK-NEXT: S_SET_VGPR_MSB 16641, implicit-def $mode
+ ; CHECK-NEXT: $vgpr0 = V_PERMLANE16_B32_e64 0, killed $vgpr300, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+ liveins: $vgpr0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $vgpr0, $vgpr300, $sgpr0, $sgpr1
+ $vgpr0 = V_PERMLANE16_B32_e64 0, killed $vgpr300, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr0, 0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir b/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
index ecfc3cdcd215c..422baae280f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
@@ -94,6 +94,7 @@ body: |
; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SET_VGPR_MSB 65, implicit-def $mode
; CHECK-NEXT: S_ENDPGM 0
$vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
@@ -231,6 +232,7 @@ body: |
; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
; CHECK-NEXT: S_SETREG_IMM32_B32 23228, 30721, implicit-def $mode, implicit $mode
+ ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SET_VGPR_MSB 16770, implicit-def $mode
; CHECK-NEXT: $vgpr512 = V_MOV_B32_e32 $vgpr513, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
More information about the llvm-commits
mailing list