[llvm] [AMDGPU] Fix GFX1250 hazard: S_SET_VGPR_MSB dropped (PR #184904)

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 5 14:59:44 PST 2026


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/184904

>From 462d4e5347b553e64f94bebf72fdd1671ae384f9 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Wed, 4 Mar 2026 19:42:17 -0500
Subject: [PATCH] [AMDGPU] Fix GFX1250 hazard: S_SET_VGPR_MSB dropped after
 S_SETREG_IMM32_B32 (MODE)

On GFX1250, S_SET_VGPR_MSB is silently dropped when immediately following
S_SETREG_IMM32_B32 targeting the MODE register.

For Case 2 (size > 12), where imm32[12:19] is part of the MODE value and
cannot be freely modified, GCNHazardRecognizer predicts whether
AMDGPULowerVGPREncoding will place S_SET_VGPR_MSB after the setreg and
inserts S_NOPs to prevent the hazard. AMDGPULowerVGPREncoding then skips
over these S_NOPs when placing S_SET_VGPR_MSB.

The prediction (willSetregNeedVGPRMSB) mirrors handleSetregMode's logic:
- If imm[12:19] matches the next VALU's MSB, no S_SET_VGPR_MSB is needed
- If no preceding high VGPRs are in use, handleSetregMode won't insert an
  S_SET_VGPR_MSB
- Otherwise, S_NOPs are inserted to separate the instructions

Shared MSB computation utilities (OpMode, ModeTy, computeMode, etc.) are
extracted into AMDGPUVGPREncoding.h for use by both passes.

The number of S_NOPs is configurable via -amdgpu-setreg-vgpr-msb-nops
(default: 1). A debug assertion verifies no back-to-back setreg +
S_SET_VGPR_MSB remains after lowering.
---
 .../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 216 +++++----------
 llvm/lib/Target/AMDGPU/AMDGPUVGPREncoding.h   | 253 ++++++++++++++++++
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  34 +++
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |   1 +
 .../AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir | 117 ++++++++
 .../CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir  |   4 +-
 6 files changed, 475 insertions(+), 150 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUVGPREncoding.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index 3aa4f1893698a..822e70c623a43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -42,10 +42,12 @@
 
 #include "AMDGPULowerVGPREncoding.h"
 #include "AMDGPU.h"
+#include "AMDGPUVGPREncoding.h"
 #include "GCNSubtarget.h"
 #include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "llvm/ADT/bit.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
@@ -54,63 +56,9 @@ using namespace llvm;
 
 namespace {
 
-class AMDGPULowerVGPREncoding {
-  static constexpr unsigned OpNum = 4;
-  static constexpr unsigned BitsPerField = 2;
-  static constexpr unsigned NumFields = 4;
-  static constexpr unsigned ModeWidth = NumFields * BitsPerField;
-  static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
-  static constexpr unsigned VGPRMSBShift =
-      llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);
-
-  struct OpMode {
-    // No MSBs set means they are not required to be of a particular value.
-    std::optional<unsigned> MSBits;
-
-    bool update(const OpMode &New, bool &Rewritten) {
-      bool Updated = false;
-      if (New.MSBits) {
-        if (*New.MSBits != MSBits.value_or(0)) {
-          Updated = true;
-          Rewritten |= MSBits.has_value();
-        }
-        MSBits = New.MSBits;
-      }
-      return Updated;
-    }
-  };
-
-  struct ModeTy {
-    OpMode Ops[OpNum];
-
-    bool update(const ModeTy &New, bool &Rewritten) {
-      bool Updated = false;
-      for (unsigned I : seq(OpNum))
-        Updated |= Ops[I].update(New.Ops[I], Rewritten);
-      return Updated;
-    }
-
-    unsigned encode() const {
-      // Layout: [src0 msb, src1 msb, src2 msb, dst msb].
-      unsigned V = 0;
-      for (const auto &[I, Op] : enumerate(Ops))
-        V |= Op.MSBits.value_or(0) << (I * 2);
-      return V;
-    }
-
-    // Check if this mode is compatible with required \p NewMode without
-    // modification.
-    bool isCompatible(const ModeTy NewMode) const {
-      for (unsigned I : seq(OpNum)) {
-        if (!NewMode.Ops[I].MSBits.has_value())
-          continue;
-        if (Ops[I].MSBits.value_or(0) != NewMode.Ops[I].MSBits.value_or(0))
-          return false;
-      }
-      return true;
-    }
-  };
+using namespace AMDGPU::VGPREncoding;
 
+class AMDGPULowerVGPREncoding {
 public:
   bool run(MachineFunction &MF);
 
@@ -150,20 +98,9 @@ class AMDGPULowerVGPREncoding {
     setMode(Mode, I);
   }
 
-  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
-  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
-
   /// Handle single \p MI. \return true if changed.
   bool runOnMachineInstr(MachineInstr &MI);
 
-  /// Compute the mode for a single \p MI given \p Ops operands
-  /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
-  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
-  /// is checked.
-  void computeMode(ModeTy &NewMode, const MachineInstr &MI,
-                   const AMDGPU::OpName Ops[OpNum],
-                   const AMDGPU::OpName *Ops2 = nullptr);
-
   /// Check if an instruction \p I is within a clause and returns a suitable
   /// iterator to insert mode change. It may also modify the S_CLAUSE
   /// instruction to extend it or drop the clause if it cannot be adjusted.
@@ -182,6 +119,10 @@ class AMDGPULowerVGPREncoding {
   /// new one was inserted.
   bool handleSetregMode(MachineInstr &MI);
 
+  /// Set CurrentMode from an encoded mode value (as returned by
+  /// ModeTy::encode / computeNextMSB).
+  void setCurrentModeFromEncoded(int64_t Encoded);
+
   /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
   /// the VGPR MSB mode value. \returns true if the immediate was changed.
   bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
@@ -222,76 +163,15 @@ bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode,
   return true;
 }
 
-std::optional<unsigned>
-AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
-  if (!MO.isReg())
-    return std::nullopt;
-
-  MCRegister Reg = MO.getReg();
-  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
-  if (!RC || !TRI->isVGPRClass(RC))
-    return std::nullopt;
-
-  unsigned Idx = TRI->getHWRegIndex(Reg);
-  return Idx >> 8;
-}
-
-void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
-                                          const MachineInstr &MI,
-                                          const AMDGPU::OpName Ops[OpNum],
-                                          const AMDGPU::OpName *Ops2) {
-  NewMode = {};
-
-  for (unsigned I = 0; I < OpNum; ++I) {
-    const MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
-
-    std::optional<unsigned> MSBits;
-    if (Op)
-      MSBits = getMSBs(*Op);
-
-#if !defined(NDEBUG)
-    if (MSBits.has_value() && Ops2) {
-      const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[I]);
-      if (Op2) {
-        std::optional<unsigned> MSBits2;
-        MSBits2 = getMSBs(*Op2);
-        if (MSBits2.has_value() && MSBits != MSBits2)
-          llvm_unreachable("Invalid VOPD pair was created");
-      }
-    }
-#endif
-
-    if (!MSBits.has_value() && Ops2) {
-      Op = TII->getNamedOperand(MI, Ops2[I]);
-      if (Op)
-        MSBits = getMSBs(*Op);
-    }
-
-    if (!MSBits.has_value())
-      continue;
-
-    // Skip tied uses of src2 of VOP2, these will be handled along with defs and
-    // only vdst bit affects these operands. We cannot skip tied uses of VOP3,
-    // these uses are real even if must match the vdst.
-    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
-        (SIInstrInfo::isVOP2(MI) ||
-         (SIInstrInfo::isVOP3(MI) &&
-          TII->hasVALU32BitEncoding(MI.getOpcode()))))
-      continue;
-
-    NewMode.Ops[I].MSBits = MSBits.value();
-  }
-}
-
 bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
   auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
   if (Ops.first) {
     ModeTy NewMode;
-    computeMode(NewMode, MI, Ops.first, Ops.second);
+    computeMode(NewMode, MI, Ops.first, Ops.second, *TII, *TRI);
     if (!CurrentMode.isCompatible(NewMode) && MI.isCommutable() &&
         TII->commuteInstruction(MI)) {
       ModeTy NewModeCommuted;
-      computeMode(NewModeCommuted, MI, Ops.first, Ops.second);
+      computeMode(NewModeCommuted, MI, Ops.first, Ops.second, *TII, *TRI);
       if (CurrentMode.isCompatible(NewModeCommuted)) {
         // Update CurrentMode with mode bits the commuted instruction relies on.
         // This prevents later instructions from piggybacking and corrupting
@@ -369,15 +249,6 @@ AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
   return I;
 }
 
-/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
-/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
-/// MODE register uses:  (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
-/// This is a left rotation by 2 bits on an 8-bit value.
-static int64_t convertModeToSetregFormat(int64_t Mode) {
-  assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
-  return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
-}
-
 bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
                                                   int64_t ModeValue) {
   assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);
@@ -423,30 +294,54 @@ bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
 
   // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we
   // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR
-  // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is
-  // in S_SET_VGPR_MSB format, so we need to convert before comparing.
+  // MSBs (current or next). Note: imm32[12:19] is in MODE register format,
+  // while ModeValue is in S_SET_VGPR_MSB format, so we need to convert before
+  // comparing.
   MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
   assert(ImmOp && "ImmOp must be present");
   int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
   int64_t SetregModeValue = convertModeToSetregFormat(ModeValue);
-  if (ImmBits12To19 == SetregModeValue) {
-    // Already correct, but we must invalidate MostRecentModeSet because this
-    // instruction will overwrite mode[12:19]. We can't update this instruction
-    // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
-    // a new s_set_vgpr_msb will be inserted after this instruction.
+  if (ModeValue == 0 || ImmBits12To19 == SetregModeValue) {
+    LLVM_DEBUG(dbgs() << "  Setreg MODE (size=" << Size
+                      << "): imm[12:19] matches current MSB " << ModeValue
+                      << "\n");
     MostRecentModeSet = nullptr;
     return false;
   }
 
-  // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
-  // the original instruction to restore the correct value.
-  MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
+  // Check if imm32[12:19] matches the next instruction's MSB. If so, the
+  // setreg already sets the correct MSB for what follows - no s_set_vgpr_msb
+  // needed, avoiding the hazard entirely.
+  if (auto NextMSB = computeNextMSB(MI.getIterator(), MBB, *TII, *TRI)) {
+    int64_t NextSetregMode = convertModeToSetregFormat(*NextMSB);
+    if (ImmBits12To19 == NextSetregMode) {
+      LLVM_DEBUG(dbgs() << "  Setreg MODE (size=" << Size
+                        << "): imm[12:19] matches next MSB " << *NextMSB
+                        << "\n");
+      setCurrentModeFromEncoded(*NextMSB);
+      MostRecentModeSet = nullptr;
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  Setreg MODE (size=" << Size
+                    << "): imm[12:19] mismatch, inserting "
+                       "s_set_vgpr_msb (current MSB=" << ModeValue << ")\n");
+  MachineBasicBlock::instr_iterator InsertPt = std::next(MI.getIterator());
+  while (InsertPt != MBB->instr_end() &&
+         InsertPt->getOpcode() == AMDGPU::S_NOP)
+    ++InsertPt;
   MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
                               TII->get(AMDGPU::S_SET_VGPR_MSB))
                           .addImm(ModeValue);
   return true;
 }
 
+void AMDGPULowerVGPREncoding::setCurrentModeFromEncoded(int64_t Encoded) {
+  for (unsigned J = 0; J < OpNum; ++J)
+    CurrentMode.Ops[J].MSBits = (Encoded >> (J * BitsPerField)) & 0x3;
+}
+
 bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   if (!ST.has1024AddressableVGPRs())
@@ -506,6 +401,29 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
     resetMode(MBB.instr_end());
   }
 
+#ifndef NDEBUG
+  for (auto &MBB : MF) {
+    for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; ++I) {
+      if (I->getOpcode() != AMDGPU::S_SETREG_IMM32_B32)
+        continue;
+      using namespace AMDGPU::Hwreg;
+      const MachineOperand *SIMM16Op =
+          TII->getNamedOperand(*I, AMDGPU::OpName::simm16);
+      auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
+      (void)Offset;
+      (void)Size;
+      if (HwRegId != ID_MODE)
+        continue;
+      auto Next = std::next(I);
+      while (Next != E && Next->isMetaInstruction())
+        ++Next;
+      assert((Next == E || Next->getOpcode() != AMDGPU::S_SET_VGPR_MSB) &&
+             "s_set_vgpr_msb immediately after s_setreg_imm32_b32 (MODE) "
+             "creates a hardware hazard on GFX1250");
+    }
+  }
+#endif
+
   return Changed;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVGPREncoding.h b/llvm/lib/Target/AMDGPU/AMDGPUVGPREncoding.h
new file mode 100644
index 0000000000000..bab88ee9baa0a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVGPREncoding.h
@@ -0,0 +1,253 @@
+//===--- AMDGPUVGPREncoding.h - VGPR MSB encoding utilities -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Shared utilities for VGPR MSB mode computation used by
+/// AMDGPULowerVGPREncoding and GCNHazardRecognizer.
+///
+/// On GFX1250, VGPRs above v255 require MSB mode bits to be set via
+/// S_SET_VGPR_MSB. These utilities compute the required MSB mode for
+/// a given instruction based on its VGPR operands.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUVGPRENCODING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUVGPRENCODING_H
+
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/MathExtras.h"
+#include <optional>
+
+namespace llvm {
+namespace AMDGPU {
+namespace VGPREncoding {
+
+static constexpr unsigned OpNum = 4;
+static constexpr unsigned BitsPerField = 2;
+static constexpr unsigned NumFields = 4;
+static constexpr unsigned ModeWidth = NumFields * BitsPerField;
+static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;
+static constexpr unsigned VGPRMSBShift =
+    llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);
+
+struct OpMode {
+  std::optional<unsigned> MSBits;
+
+  bool update(const OpMode &New, bool &Rewritten) {
+    bool Updated = false;
+    if (New.MSBits) {
+      if (*New.MSBits != MSBits.value_or(0)) {
+        Updated = true;
+        Rewritten |= MSBits.has_value();
+      }
+      MSBits = New.MSBits;
+    }
+    return Updated;
+  }
+};
+
+struct ModeTy {
+  OpMode Ops[OpNum];
+
+  bool update(const ModeTy &New, bool &Rewritten) {
+    bool Updated = false;
+    for (unsigned I : seq(OpNum))
+      Updated |= Ops[I].update(New.Ops[I], Rewritten);
+    return Updated;
+  }
+
+  unsigned encode() const {
+    unsigned V = 0;
+    for (const auto &[I, Op] : enumerate(Ops))
+      V |= Op.MSBits.value_or(0) << (I * 2);
+    return V;
+  }
+
+  bool isCompatible(const ModeTy NewMode) const {
+    for (unsigned I : seq(OpNum)) {
+      if (!NewMode.Ops[I].MSBits.has_value())
+        continue;
+      if (Ops[I].MSBits.value_or(0) != NewMode.Ops[I].MSBits.value_or(0))
+        return false;
+    }
+    return true;
+  }
+};
+
+/// Get the MSB bits for a register operand. Returns the high bits
+/// (index >> 8) if the operand is a VGPR, nullopt otherwise.
+inline std::optional<unsigned> getMSBs(const MachineOperand &MO,
+                                       const SIRegisterInfo &TRI) {
+  if (!MO.isReg())
+    return std::nullopt;
+
+  MCRegister Reg = MO.getReg();
+  const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+  if (!RC || !TRI.isVGPRClass(RC))
+    return std::nullopt;
+
+  unsigned Idx = TRI.getHWRegIndex(Reg);
+  return Idx >> 8;
+}
+
+/// Compute the VGPR MSB mode required by a single instruction.
+inline void computeMode(ModeTy &NewMode, const MachineInstr &MI,
+                        const AMDGPU::OpName Ops[OpNum],
+                        const AMDGPU::OpName *Ops2,
+                        const SIInstrInfo &TII, const SIRegisterInfo &TRI) {
+  NewMode = {};
+
+  for (unsigned I = 0; I < OpNum; ++I) {
+    const MachineOperand *Op = TII.getNamedOperand(MI, Ops[I]);
+
+    std::optional<unsigned> MSBits;
+    if (Op)
+      MSBits = getMSBs(*Op, TRI);
+
+#if !defined(NDEBUG)
+    if (MSBits.has_value() && Ops2) {
+      const MachineOperand *Op2 = TII.getNamedOperand(MI, Ops2[I]);
+      if (Op2) {
+        std::optional<unsigned> MSBits2 = getMSBs(*Op2, TRI);
+        if (MSBits2.has_value() && MSBits != MSBits2)
+          llvm_unreachable("Invalid VOPD pair was created");
+      }
+    }
+#endif
+
+    if (!MSBits.has_value() && Ops2) {
+      Op = TII.getNamedOperand(MI, Ops2[I]);
+      if (Op)
+        MSBits = getMSBs(*Op, TRI);
+    }
+
+    if (!MSBits.has_value())
+      continue;
+
+    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
+        (SIInstrInfo::isVOP2(MI) ||
+         (SIInstrInfo::isVOP3(MI) &&
+          TII.hasVALU32BitEncoding(MI.getOpcode()))))
+      continue;
+
+    NewMode.Ops[I].MSBits = MSBits.value();
+  }
+}
+
+/// Convert mode value from S_SET_VGPR_MSB format to MODE register format.
+/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
+/// MODE register uses:  (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
+/// This is a left rotation by 2 bits on an 8-bit value.
+inline int64_t convertModeToSetregFormat(int64_t Mode) {
+  assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
+  return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2);
+}
+
+/// Look ahead from iterator \p I in \p MBB to find the VGPR MSB mode needed
+/// by the next VALU instruction. Returns the encoded mode value, or
+/// std::nullopt if no VGPR instruction is found before block end or a
+/// terminator.
+inline std::optional<int64_t>
+computeNextMSB(MachineBasicBlock::instr_iterator I, MachineBasicBlock *MBB,
+               const SIInstrInfo &TII, const SIRegisterInfo &TRI) {
+  auto E = MBB->instr_end();
+  for (auto Next = std::next(I); Next != E; ++Next) {
+    if (Next->isMetaInstruction())
+      continue;
+    if (Next->isTerminator() || Next->isCall() || Next->isInlineAsm())
+      return std::nullopt;
+    auto Ops = AMDGPU::getVGPRLoweringOperandTables(Next->getDesc());
+    if (!Ops.first)
+      continue;
+    ModeTy NextMode;
+    computeMode(NextMode, *Next, Ops.first, Ops.second, TII, TRI);
+    bool HasVGPR = llvm::any_of(NextMode.Ops, [](const OpMode &Op) {
+      return Op.MSBits.has_value();
+    });
+    if (HasVGPR)
+      return NextMode.encode();
+  }
+  return std::nullopt;
+}
+
+/// Compute the accumulated VGPR MSB mode from the beginning of the block
+/// up to (not including) \p SetregI. This approximates what CurrentMode
+/// would be in AMDGPULowerVGPREncoding when it reaches the setreg.
+inline unsigned computeCurrentMSB(MachineBasicBlock::instr_iterator SetregI,
+                                  MachineBasicBlock *MBB,
+                                  const SIInstrInfo &TII,
+                                  const SIRegisterInfo &TRI) {
+  ModeTy CurrentMode;
+  for (auto I = MBB->instr_begin(); I != SetregI; ++I) {
+    if (I->isMetaInstruction())
+      continue;
+    if (I->isTerminator() || I->isCall())
+      break;
+    auto Ops = AMDGPU::getVGPRLoweringOperandTables(I->getDesc());
+    if (!Ops.first)
+      continue;
+    ModeTy NewMode;
+    computeMode(NewMode, *I, Ops.first, Ops.second, TII, TRI);
+    bool Unused = false;
+    CurrentMode.update(NewMode, Unused);
+  }
+  return CurrentMode.encode();
+}
+
+/// Predict whether AMDGPULowerVGPREncoding will place S_SET_VGPR_MSB
+/// immediately after an S_SETREG_IMM32_B32 (MODE) with size > 12.
+///
+/// The prediction mirrors handleSetregMode's logic:
+/// 1. If imm[12:19] matches the next VALU's MSB → no S_SET_VGPR_MSB (skip)
+/// 2. If CurrentMode is zero → handleSetregMode won't insert, but setMode
+///    for the next VALU might insert right after setreg if it needs high VGPRs
+/// 3. If CurrentMode is non-zero and imm[12:19] doesn't match → insert
+inline bool willSetregNeedVGPRMSB(MachineInstr &MI, MachineBasicBlock *MBB,
+                                  const SIInstrInfo &TII,
+                                  const SIRegisterInfo &TRI) {
+  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);
+
+  const MachineOperand *ImmOp = TII.getNamedOperand(MI, AMDGPU::OpName::imm);
+  assert(ImmOp && "ImmOp must be present");
+  int64_t ImmBits12To19 =
+      (ImmOp->getImm() & AMDGPU::Hwreg::VGPR_MSB_MASK) >> VGPRMSBShift;
+
+  auto NextMSB = computeNextMSB(MI.getIterator(), MBB, TII, TRI);
+
+  // If the next VALU's MSB matches imm[12:19], AMDGPULowerVGPREncoding
+  // will detect this and skip S_SET_VGPR_MSB entirely.
+  if (NextMSB) {
+    int64_t NextSetregMode = convertModeToSetregFormat(*NextMSB);
+    if (ImmBits12To19 == NextSetregMode)
+      return false;
+  }
+
+  // Compute what CurrentMode would be at the setreg. If it's zero (no
+  // preceding high VGPRs) or imm[12:19] already matches it, handleSetregMode
+  // won't insert. But setMode for the next VALU may still insert right after
+  // setreg if the next VALU needs high VGPRs with a different mode.
+  unsigned CurrentMSB = computeCurrentMSB(MI.getIterator(), MBB, TII, TRI);
+  if (CurrentMSB == 0 ||
+      ImmBits12To19 == convertModeToSetregFormat(CurrentMSB))
+    return NextMSB && *NextMSB != 0;
+
+  return true;
+}
+
+} // namespace VGPREncoding
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUVGPRENCODING_H
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 30a9d1d2ab149..caa3bbcbd9217 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "GCNHazardRecognizer.h"
+#include "AMDGPUVGPREncoding.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
@@ -63,6 +64,11 @@ static cl::opt<bool> EnableWMMAVnopHoisting(
     "amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden,
     cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
 
+static cl::opt<unsigned> SetregModeToVGPRMSBHazardNOPs(
+    "amdgpu-setreg-vgpr-msb-nops", cl::init(1), cl::Hidden,
+    cl::desc("Number of s_nop cycles after s_setreg_imm32_b32 (MODE) "
+             "for VGPR MSB hazard on GFX1250 (emitted as s_nop N-1)"));
+
 //===----------------------------------------------------------------------===//
 // Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
@@ -1308,6 +1314,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
     fixScratchBaseForwardingHazard(MI);
   if (ST.setRegModeNeedsVNOPs())
     fixSetRegMode(MI);
+  if (ST.has1024AddressableVGPRs())
+    fixSetRegModeToVGPRMSBHazard(MI);
 }
 
 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -3865,3 +3873,29 @@ bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
   return true;
 }
+
+bool GCNHazardRecognizer::fixSetRegModeToVGPRMSBHazard(MachineInstr *MI) {
+  if (MI->getOpcode() != AMDGPU::S_SETREG_IMM32_B32)
+    return false;
+
+  using namespace AMDGPU::Hwreg;
+  auto [HwRegId, Offset, Size] =
+      HwregEncoding::decode(MI->getOperand(1).getImm());
+  (void)Offset;
+  if (HwRegId != ID_MODE || Size <= 12)
+    return false;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  const SIRegisterInfo &TRI = TII.getRegisterInfo();
+
+  if (!AMDGPU::VGPREncoding::willSetregNeedVGPRMSB(*MI, MBB, TII, TRI))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "  Inserting s_nop " << SetregModeToVGPRMSBHazardNOPs - 1
+                    << " after s_setreg_imm32_b32 (MODE)"
+                    << " for VGPR MSB hazard\n");
+  auto NextI = std::next(MI->getIterator());
+  BuildMI(*MBB, NextI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+      .addImm(SetregModeToVGPRMSBHazardNOPs - 1);
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index b331504d40113..52b54e8581f6b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -137,6 +137,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI);
   bool fixScratchBaseForwardingHazard(MachineInstr *MI);
   bool fixSetRegMode(MachineInstr *MI);
+  bool fixSetRegModeToVGPRMSBHazard(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir
new file mode 100644
index 0000000000000..66059ddd9de5b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-setreg-vgpr-msb-gfx1250.mir
@@ -0,0 +1,117 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=post-RA-hazard-rec,amdgpu-lower-vgpr-encoding -o - %s | FileCheck %s
+
+# Test handling of the GFX1250 hardware hazard where S_SET_VGPR_MSB immediately
+# after S_SETREG_IMM32_B32 (MODE) is silently dropped.
+#
+# GCNHazardRecognizer inserts S_NOPs for Case 2 (size > 12) when preceding
+# instructions use high VGPRs. AMDGPULowerVGPREncoding skips over these S_NOPs
+# when placing S_SET_VGPR_MSB.
+
+---
+# Case 2 mismatch: setreg (size=16) with imm32[12:19] that doesn't match
+# current VGPR MSB. GCNHazardRecognizer inserts S_NOPs, then
+# AMDGPULowerVGPREncoding places S_SET_VGPR_MSB after them.
+name: setreg_mode_size_gt_12_mismatch
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: setreg_mode_size_gt_12_mismatch
+    ; CHECK: S_SET_VGPR_MSB 64, implicit-def $mode
+    ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: S_SET_VGPR_MSB 64, implicit-def $mode
+    ; CHECK-NEXT: S_ENDPGM 0
+    $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+    ; imm32 = 0x23ABC = 146108 (bits 12:19 = 0x23, doesn't match VGPR MSB mode)
+    S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    S_ENDPGM 0
+...
+
+---
+# Case 2 matches next MSB: setreg (size=16) with imm32[12:19] matching the
+# NEXT instruction's MSB. GCNHazardRecognizer predicts that
+# AMDGPULowerVGPREncoding will skip S_SET_VGPR_MSB (since imm matches next),
+# so no S_NOPs are inserted.
+#
+# v256/v257 -> mode 65, MODE format 5
+# v512/v513 -> mode 130, MODE format 10 = 0xA
+# imm32 = 0xAABC (bits 12:19 = 0xA = 10, matches next MSB)
+name: setreg_mode_size_gt_12_matches_next
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: setreg_mode_size_gt_12_matches_next
+    ; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
+    ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $vgpr257, implicit $exec
+    ; CHECK-NEXT: S_SETREG_IMM32_B32 43708, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: $vgpr512 = V_MOV_B32_e32 undef $vgpr513, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    $vgpr256 = V_MOV_B32_e32 undef $vgpr257, implicit $exec
+    ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+    ; imm32 = 0xAABC = 43708 (bits 12:19 = 0xA = 10, matches next MSB for v512/v513)
+    S_SETREG_IMM32_B32 43708, 30721, implicit-def $mode, implicit $mode
+    $vgpr512 = V_MOV_B32_e32 undef $vgpr513, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+# No hazard: S_SETREG_IMM32_B32 targeting non-MODE register.
+name: setreg_non_mode_no_hazard
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: setreg_non_mode_no_hazard
+    ; CHECK: S_SET_VGPR_MSB 64, implicit-def $mode
+    ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; CHECK-NEXT: S_SETREG_IMM32_B32 0, 2178, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_SET_VGPR_MSB 16384, implicit-def $mode
+    ; CHECK-NEXT: $vgpr0 = V_ADD_F32_e32 undef $vgpr1, undef $vgpr2, implicit $exec, implicit $mode
+    ; CHECK-NEXT: S_ENDPGM 0
+    $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; hwreg(STATUS, 2, 2): simm16 = 2 | (2 << 6) | (1 << 11) = 0x882 = 2178
+    S_SETREG_IMM32_B32 0, 2178, implicit-def $mode, implicit $mode
+    $vgpr0 = V_ADD_F32_e32 undef $vgpr1, undef $vgpr2, implicit $exec, implicit $mode
+    S_ENDPGM 0
+...
+
+---
+# No hazard: Case 2 but no high VGPRs in the block at all.
+name: setreg_mode_size_gt_12_no_high_vgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: setreg_mode_size_gt_12_no_high_vgpr
+    ; CHECK: $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+    S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    S_ENDPGM 0
+...
+
+---
+# Case 2 with high VGPR only AFTER setreg: setreg (size=16) with low VGPRs
+# before but high VGPRs after. GCNHazardRecognizer predicts that the next
+# VALU needs high VGPRs and imm[12:19] doesn't match, so inserts S_NOPs.
+# AMDGPULowerVGPREncoding's setMode places S_SET_VGPR_MSB after the NOPs.
+name: setreg_mode_size_gt_12_high_vgpr_after
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: setreg_mode_size_gt_12_high_vgpr_after
+    ; CHECK: $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: S_SET_VGPR_MSB 64, implicit-def $mode
+    ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    $vgpr0 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    ; hwreg(MODE, 0, 16): simm16 = 0x7801 = 30721
+    S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    $vgpr256 = V_MOV_B32_e32 undef $sgpr0, implicit $exec
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir b/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
index ecfc3cdcd215c..719ec9f093aae 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-setreg-mode-swar.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=amdgpu-lower-vgpr-encoding -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=post-RA-hazard-rec,amdgpu-lower-vgpr-encoding -o - %s | FileCheck %s
 
 ---
 # Case 1a: Size < 12 (size=4), imm32[12:19]=0
@@ -94,6 +94,7 @@ body:             |
     ; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
     ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
     ; CHECK-NEXT: S_SETREG_IMM32_B32 146108, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: S_SET_VGPR_MSB 65, implicit-def $mode
     ; CHECK-NEXT: S_ENDPGM 0
     $vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
@@ -231,6 +232,7 @@ body:             |
     ; CHECK: S_SET_VGPR_MSB 65, implicit-def $mode
     ; CHECK-NEXT: $vgpr256 = V_MOV_B32_e32 $vgpr257, implicit $exec
     ; CHECK-NEXT: S_SETREG_IMM32_B32 23228, 30721, implicit-def $mode, implicit $mode
+    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: S_SET_VGPR_MSB 16770, implicit-def $mode
     ; CHECK-NEXT: $vgpr512 = V_MOV_B32_e32 $vgpr513, implicit $exec
     ; CHECK-NEXT: S_ENDPGM 0



More information about the llvm-commits mailing list