[llvm] 853b2a8 - [AMDGPU] Reserve SGPR pair when long branches are present

Brendon Cahoon via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 29 14:53:11 PDT 2023


Author: Brendon Cahoon
Date: 2023-06-29T16:50:46-05:00
New Revision: 853b2a84cb9902725752e9603011041ebe33c7bf

URL: https://github.com/llvm/llvm-project/commit/853b2a84cb9902725752e9603011041ebe33c7bf
DIFF: https://github.com/llvm/llvm-project/commit/853b2a84cb9902725752e9603011041ebe33c7bf.diff

LOG: [AMDGPU] Reserve SGPR pair when long branches are present

Branch relaxation requires 2 additional SGPRs on AMDGPU to handle the
case when an indirect branch target is too far away. The register
scavenger may not find available registers, which causes a "did not find
scavenging index" assert in assignRegToScavengingIndex.

In this patch, we estimate before register allocation whether an
indirect branch is likely to be needed, and reserve 2 SGPRs if the
branch distance is found to be above a threshold. The distance threshold
is an approximation as the exact code size and branch distance are
unknown prior to register allocation.

Patch by Corbin Robeck. Thanks!

Differential Revision: https://reviews.llvm.org/D149775
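
For illustration, here is a minimal standalone C++ sketch of the heuristic the
new pass applies. The helper name needsReservedPair and the hard-coded 16-bit
signed dword offset are assumptions for exposition only; the actual pass
(GCNPreRALongBranchReg.cpp below) walks MachineBasicBlocks and asks
SIInstrInfo::isBranchOffsetInRange instead.

  // Approximate every block as 8 bytes per non-meta instruction, scale the
  // destination block's offset by LongBranchFactor, and report that an SGPR
  // pair should be reserved if the scaled distance would not fit in the
  // (assumed) 16-bit signed dword branch offset field.
  #include <cstdint>
  #include <vector>

  static bool needsReservedPair(const std::vector<uint64_t> &InstrsPerBlock,
                                const std::vector<int> &UncondBranchDest,
                                double LongBranchFactor = 1.0) {
    std::vector<uint64_t> Offset(InstrsPerBlock.size(), 0);
    for (size_t I = 1; I < Offset.size(); ++I)
      Offset[I] = Offset[I - 1] + 8 * InstrsPerBlock[I - 1];
    const int64_t MaxDwords = (1 << 15) - 1; // assumed branch offset range
    for (int Dest : UncondBranchDest) {
      if (Dest < 0)
        continue; // block does not end in an unconditional branch
      uint64_t Dist =
          static_cast<uint64_t>(LongBranchFactor * Offset[Dest]);
      if (static_cast<int64_t>(Dist / 4) > MaxDwords)
        return true; // likely out of range: reserve the pair before RA
    }
    return false;
  }

Note that the updated RUN lines in the branch relaxation tests pass
-amdgpu-long-branch-factor=0, which disables the reservation so those tests
keep exercising the register scavenger path.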

Added: 
    llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
    llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
    llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
    llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0ddc76756ec385..c25194c02f72bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -327,6 +327,9 @@ extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+void initializeGCNPreRALongBranchRegPass(PassRegistry &);
+extern char &GCNPreRALongBranchRegID;
+
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a3599fdd328387..cf466e810d9189 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -421,6 +421,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUResourceUsageAnalysisPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeGCNPreRAOptimizationsPass(*PR);
+  initializeGCNPreRALongBranchRegPass(*PR);
   initializeGCNRewritePartialRegUsesPass(*PR);
 }
 
@@ -1341,6 +1342,8 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
   if (!usingDefaultRegAlloc())
     report_fatal_error(RegAllocOptNotSupportedMessage);
 
+  addPass(&GCNPreRALongBranchRegID);
+
   addPass(createSGPRAllocPass(false));
 
   // Equivalent of PEI for SGPRs.
@@ -1354,6 +1357,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   if (!usingDefaultRegAlloc())
     report_fatal_error(RegAllocOptNotSupportedMessage);
 
+  addPass(&GCNPreRALongBranchRegID);
+
   addPass(createSGPRAllocPass(true));
 
   // Commit allocated register changes. This is mostly necessary because too
@@ -1476,6 +1481,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
   if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
     return true;
 
+  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
+                            MFI->LongBranchReservedReg))
+    return true;
+
   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
     const MemoryBuffer &Buffer =

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 2b2788680d94a4..08dbeba35766fa 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -107,6 +107,7 @@ add_llvm_target(AMDGPUCodeGen
   GCNMinRegStrategy.cpp
   GCNNSAReassign.cpp
   GCNPreRAOptimizations.cpp
+  GCNPreRALongBranchReg.cpp
   GCNRegPressure.cpp
   GCNRewritePartialRegUses.cpp
   GCNSchedStrategy.cpp

diff  --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
new file mode 100644
index 00000000000000..b50af38683ed1b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
@@ -0,0 +1,139 @@
+//===-- GCNPreRALongBranchReg.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+// \brief Pass to estimate pre-RA branch size and reserve a pair of SGPRs if
+// there is a long branch. Branch size at this point is difficult to track
+// since we have no idea what spills will be inserted later on. We just assume
+// 8 bytes per instruction to compute approximations without computing the
+// actual instruction size, to see if we're in the neighborhood of the maximum
+// branch distance threshold. Tuning of what is considered "long" is handled
+// through the amdgpu-long-branch-factor cl argument, which sets
+// LongBranchFactor.
+//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-pre-ra-long-branch-reg"
+
+namespace {
+
+static cl::opt<double> LongBranchFactor(
+    "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden,
+    cl::desc("Factor to apply to what qualifies as a long branch "
+             "to reserve a pair of scalar registers. If this value "
+             "is 0, the long branch registers are never reserved. As this "
+             "value grows, the greater the chance that the branch distance "
+             "will fall within the threshold and the registers will be "
+             "marked as reserved. We lean towards always reserving a "
+             "register for long jumps."));
+
+class GCNPreRALongBranchReg : public MachineFunctionPass {
+
+  struct BasicBlockInfo {
+    // Offset - Distance from the beginning of the function to the beginning
+    // of this basic block.
+    uint64_t Offset = 0;
+    // Size - Size of the basic block in bytes
+    uint64_t Size = 0;
+  };
+  void generateBlockInfo(MachineFunction &MF,
+                         SmallVectorImpl<BasicBlockInfo> &BlockInfo);
+
+public:
+  static char ID;
+  GCNPreRALongBranchReg() : MachineFunctionPass(ID) {
+    initializeGCNPreRALongBranchRegPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "AMDGPU Pre-RA Long Branch Reg";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // End anonymous namespace.
+char GCNPreRALongBranchReg::ID = 0;
+
+INITIALIZE_PASS(GCNPreRALongBranchReg, DEBUG_TYPE,
+                "AMDGPU Pre-RA Long Branch Reg", false, false)
+
+char &llvm::GCNPreRALongBranchRegID = GCNPreRALongBranchReg::ID;
+void GCNPreRALongBranchReg::generateBlockInfo(
+    MachineFunction &MF, SmallVectorImpl<BasicBlockInfo> &BlockInfo) {
+
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Approximate the size of all basic blocks by just
+  // assuming 8 bytes per instruction
+  for (const MachineBasicBlock &MBB : MF) {
+    uint64_t NumInstr = 0;
+    // Loop through the basic block and add up all non-debug
+    // non-meta instructions
+    for (const MachineInstr &MI : MBB) {
+      // isMetaInstruction is a superset of isDebugInstr
+      if (MI.isMetaInstruction())
+        continue;
+      NumInstr += 1;
+    }
+    // Approximate size as just 8 bytes per instruction
+    BlockInfo[MBB.getNumber()].Size = 8 * NumInstr;
+  }
+  uint64_t PrevNum = (&MF)->begin()->getNumber();
+  for (auto &MBB :
+       make_range(std::next(MachineFunction::iterator((&MF)->begin())),
+                  (&MF)->end())) {
+    uint64_t Num = MBB.getNumber();
+    // Compute the offset immediately following this block.
+    BlockInfo[Num].Offset = BlockInfo[PrevNum].Offset + BlockInfo[PrevNum].Size;
+    PrevNum = Num;
+  }
+}
+bool GCNPreRALongBranchReg::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = STM.getInstrInfo();
+  const SIRegisterInfo *TRI = STM.getRegisterInfo();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // For now, reserve the highest available SGPR pair. After RA,
+  // shift down to a lower unused pair of SGPRs.
+  // If all registers are used, then findUnusedRegister will return
+  // AMDGPU::NoRegister.
+  constexpr bool ReserveHighestRegister = true;
+  Register LongBranchReservedReg = TRI->findUnusedRegister(
+      MRI, &AMDGPU::SGPR_64RegClass, MF, ReserveHighestRegister);
+  if (!LongBranchReservedReg)
+    return false;
+
+  // Approximate code size and offsets of each basic block
+  SmallVector<BasicBlockInfo, 16> BlockInfo;
+  generateBlockInfo(MF, BlockInfo);
+
+  for (const MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::const_iterator Last = MBB.getLastNonDebugInstr();
+    if (Last == MBB.end() || !Last->isUnconditionalBranch())
+      continue;
+    MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last);
+    uint64_t BlockDistance = static_cast<uint64_t>(
+        LongBranchFactor * BlockInfo[DestBB->getNumber()].Offset);
+    // If the distance falls outside the threshold assume it is a long branch
+    // and we need to reserve the registers
+    if (!TII->isBranchOffsetInRange(Last->getOpcode(), BlockDistance)) {
+      MFI->setLongBranchReservedReg(LongBranchReservedReg);
+      return true;
+    }
+  }
+  return false;
+}

diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6d9a2390d3d8fd..fe5e8e1c59cd97 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1437,13 +1437,27 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
         TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
     if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                           TRI->getHWRegIndex(VGPRForAGPRCopy))) {
-      // Call to setVGPRForAGPRCopy() should happen first before calling
-      // freezeReservedRegs() so that getReservedRegs() can reserve this newly
-      // identified VGPR (for AGPR copy).
+      // Reserve this newly identified VGPR (for AGPR copy).
+      // Reserved registers should already be frozen at this point,
+      // so we can avoid calling MRI.freezeReservedRegs and just use
+      // MRI.reserveReg.
       FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
-      MRI.freezeReservedRegs(MF);
+      MRI.reserveReg(UnusedLowVGPR, TRI);
     }
   }
+  // We initially reserved the highest available SGPR pair for long branches;
+  // now, after RA, we shift down to a lower unused one if one exists.
+  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
+  Register UnusedLowSGPR =
+      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
+  // If LongBranchReservedReg is null then we didn't find a long branch
+  // and never reserved a register to begin with, so there is nothing to
+  // shift down. Then if UnusedLowSGPR is null, there isn't an available
+  // lower register to use, so just keep the original one we set.
+  if (LongBranchReservedReg && UnusedLowSGPR) {
+    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
+    MRI.reserveReg(UnusedLowSGPR, TRI);
+  }
 }
 
 // The special SGPR spills like the one needed for FP, BP or any reserved

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1e3e518cb0a6f6..bcb27826a26752 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2550,6 +2550,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
 
   MachineFunction *MF = MBB.getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   // FIXME: Virtual register workaround for RegScavenger not working with empty
   // blocks.
@@ -2613,10 +2614,20 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   // dest_bb:
   //   buzz;
 
-  RS->enterBasicBlockEnd(MBB);
-  Register Scav = RS->scavengeRegisterBackwards(
-      AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
-      /* RestoreAfter */ false, 0, /* AllowSpill */ false);
+  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
+  Register Scav;
+
+  // If we've previously reserved a register for long branches,
+  // avoid running the scavenger and just use those registers.
+  if (LongBranchReservedReg) {
+    RS->enterBasicBlock(MBB);
+    Scav = LongBranchReservedReg;
+  } else {
+    RS->enterBasicBlockEnd(MBB);
+    Scav = RS->scavengeRegisterBackwards(
+        AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
+        /* RestoreAfter */ false, 0, /* AllowSpill */ false);
+  }
   if (Scav) {
     RS->setRegUsed(Scav);
     MRI.replaceRegWith(PCReg, Scav);

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8eaec957b3835d..6ec028c0047869 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -648,6 +648,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
   for (Register Reg : MFI.getWWMReservedRegs())
     WWMReservedRegs.push_back(regToString(Reg, TRI));
 
+  if (MFI.getLongBranchReservedReg())
+    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
   if (MFI.getVGPRForAGPRCopy())
     VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);
   auto SFI = MFI.getOptionalScavengeFI();

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index df3abdfb653176..1cf88d78f66bd0 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -283,6 +283,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   SIMode Mode;
   std::optional<FrameIndex> ScavengeFI;
   StringValue VGPRForAGPRCopy;
+  StringValue LongBranchReservedReg;
 
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
@@ -326,6 +327,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
     YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy,
                        StringValue()); // Don't print out when it's empty.
+    YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
+                       StringValue());
   }
 };
 
@@ -381,6 +384,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   // base to the beginning of the new function's frame.
   Register StackPtrOffsetReg = AMDGPU::SP_REG;
 
+  // Registers that may be reserved before RA so that a pair is available
+  // in case an indirect branch ends up being needed during branch
+  // relaxation and RA doesn't leave enough free registers.
+  Register LongBranchReservedReg;
+
   AMDGPUFunctionArgInfo ArgInfo;
 
   // Graphics info.
@@ -891,6 +899,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     StackPtrOffsetReg = Reg;
   }
 
+  void setLongBranchReservedReg(Register Reg) { LongBranchReservedReg = Reg; }
+
   // Note the unset value for this is AMDGPU::SP_REG rather than
   // NoRegister. This is mostly a workaround for MIR tests where state that
   // can't be directly computed from the function is not preserved in serialized
@@ -899,6 +909,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return StackPtrOffsetReg;
   }
 
+  Register getLongBranchReservedReg() const { return LongBranchReservedReg; }
+
   Register getQueuePtrUserSGPR() const {
     return ArgInfo.QueuePtr.getRegister();
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d63ccf426a9c64..977ae9f67a2ca3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -615,6 +615,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     reserveRegisterTuples(Reserved, ScratchRSrcReg);
   }
 
+  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
+  if (LongBranchReservedReg)
+    reserveRegisterTuples(Reserved, LongBranchReservedReg);
+
   // We have to assume the SP is needed in case there are calls in the function,
   // which is detected after the function is lowered. If we aren't really going
   // to need SP, don't bother reserving it.
@@ -2878,13 +2882,12 @@ bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
 
 /// Returns the lowest register that is not used at any point in the function.
 ///        If all registers are used, then this function will return
-///         AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return
+///         AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
 ///         highest unused register.
-MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
-                                              const TargetRegisterClass *RC,
-                                              const MachineFunction &MF,
-                                              bool ReserveHighestVGPR) const {
-  if (ReserveHighestVGPR) {
+MCRegister SIRegisterInfo::findUnusedRegister(
+    const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
+    const MachineFunction &MF, bool ReserveHighestRegister) const {
+  if (ReserveHighestRegister) {
     for (MCRegister Reg : reverse(*RC))
       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
         return Reg;

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 7549805121f069..a8b92df4c7ec84 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -o - %s | FileCheck %s
 
 define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; CHECK-LABEL: spill:

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 9e1e61848320bb..5c7ece9015b9ac 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 
 ; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
 ;        See PR33579.
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
 ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
 
 ; OBJ:       Relocations [

diff  --git a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
index 4417a280e5101c..5eaf0a8d7848f6 100644
--- a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-s-branch-bits=6 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-s-branch-bits=6 -amdgpu-long-branch-factor=0 < %s | FileCheck -check-prefix=GCN %s
 
 
 ; Restrict maximum branch to between +31 and -32 dwords

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 35750da0883a54..b9423700ea1f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -114,6 +114,7 @@
 ; GCN-O0-NEXT:        Virtual Register Map
 ; GCN-O0-NEXT:        Live Register Matrix
 ; GCN-O0-NEXT:        SI Pre-allocate WWM Registers
+; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
 ; GCN-O0-NEXT:        SI lower SGPR spill instructions
 ; GCN-O0-NEXT:        Fast Register Allocator
@@ -349,6 +350,7 @@
 ; GCN-O1-NEXT:        Live Register Matrix
 ; GCN-O1-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-NEXT:        SI optimize exec mask operations pre-RA
+; GCN-O1-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-NEXT:        Debug Variable Analysis
@@ -647,6 +649,7 @@
 ; GCN-O1-OPTS-NEXT:        Live Register Matrix
 ; GCN-O1-OPTS-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O1-OPTS-NEXT:        SI optimize exec mask operations pre-RA
+; GCN-O1-OPTS-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:        Debug Variable Analysis
@@ -955,6 +958,7 @@
 ; GCN-O2-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O2-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O2-NEXT:        SI Form memory clauses
+; GCN-O2-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Machine Block Frequency Analysis
 ; GCN-O2-NEXT:        Debug Variable Analysis
@@ -1275,6 +1279,7 @@
 ; GCN-O3-NEXT:        SI Pre-allocate WWM Registers
 ; GCN-O3-NEXT:        SI optimize exec mask operations pre-RA
 ; GCN-O3-NEXT:        SI Form memory clauses
+; GCN-O3-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Machine Block Frequency Analysis
 ; GCN-O3-NEXT:        Debug Variable Analysis

diff  --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
new file mode 100644
index 00000000000000..dc7d2eed53696e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
@@ -0,0 +1,330 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; OBJ:       Relocations [
+; OBJ-NEXT: ]
+
+; Used to emit an always 4 byte instruction. Inline asm always assumes
+; each instruction is the maximum size.
+declare void @llvm.amdgcn.s.sleep(i32) #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
+; GCN-LABEL: uniform_conditional_max_short_forward_branch:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
+; GCN-NEXT:  ; %bb.1: ; %bb2
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_sleep 0
+; GCN-NEXT:  .LBB0_2: ; %bb3
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+bb:
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
+
+bb2:
+; 24 bytes
+  call void asm sideeffect
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  call void @llvm.amdgcn.s.sleep(i32 0)
+  br label %bb3
+
+bb3:
+  store volatile i32 %cnd, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
+; GCN-LABEL: uniform_conditional_min_long_forward_branch:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:  	 s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:  	 s_waitcnt lgkmcnt(0)
+; GCN-NEXT:  	 s_cmp_eq_u32 s2, 0
+; GCN-NEXT:  	 s_cbranch_scc0 .LBB1_1
+; GCN-NEXT:  .LBB1_3: ; %bb0
+; GCN-NEXT:  	 s_getpc_b64 s[8:9]
+; GCN-NEXT:  .Lpost_getpc0:
+; GCN-NEXT:  	 s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295
+; GCN-NEXT:  	 s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32
+; GCN-NEXT:  	 s_setpc_b64 s[8:9]
+; GCN-NEXT:  .LBB1_1: ; %bb2
+; GCN-NEXT:  	 ;;#ASMSTART
+; GCN-NEXT:  	 v_nop_e64
+; GCN-NEXT:     v_nop_e64
+; GCN-NEXT:     v_nop_e64
+; GCN-NEXT:     v_nop_e64
+; GCN-NEXT:  	 ;;#ASMEND
+; GCN-NEXT:  .LBB1_2: ; %bb3
+; GCN-NEXT:  	 s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:  	 s_mov_b32 s7, 0xf000
+; GCN-NEXT:  	 s_mov_b32 s6, -1
+; GCN-NEXT:  	 v_mov_b32_e32 v0, s2
+; GCN-NEXT:  	 s_waitcnt lgkmcnt(0)
+; GCN-NEXT:  	 buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:  	 s_waitcnt vmcnt(0)
+; GCN-NEXT:  	 s_endpgm
+bb0:
+  %cmp = icmp eq i32 %cnd, 0
+  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
+
+bb2:
+; 32 bytes
+  call void asm sideeffect
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  br label %bb3
+
+bb3:
+  store volatile i32 %cnd, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
+; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[4:5], s2, 0
+; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT:    s_cbranch_vccz .LBB2_1
+; GCN-NEXT:  .LBB2_3: ; %bb0
+; GCN-NEXT:    s_getpc_b64 s[8:9]
+; GCN-NEXT:  .Lpost_getpc1:
+; GCN-NEXT:    s_add_u32 s8, s8, (.LBB2_2-.Lpost_getpc1)&4294967295
+; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB2_2-.Lpost_getpc1)>>32
+; GCN-NEXT:    s_setpc_b64 s[8:9]
+; GCN-NEXT:  .LBB2_1: ; %bb2
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:     ; 32 bytes
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:  .LBB2_2: ; %bb3
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+bb0:
+  %cmp = fcmp oeq float %cnd, 0.0
+  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
+
+bb2:
+  call void asm sideeffect " ; 32 bytes
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  br label %bb3
+
+bb3:
+  store volatile float %cnd, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
+; GCN-LABEL: min_long_forward_vbranch:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GCN-NEXT:    s_cbranch_execnz .LBB3_1
+; GCN-NEXT:  .LBB3_3: ; %bb
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:  .Lpost_getpc2:
+; GCN-NEXT:    s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295
+; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN-NEXT:  .LBB3_1: ; %bb2
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:     ; 32 bytes
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:  .LBB3_2: ; %bb3
+; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s2
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = zext i32 %tid to i64
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
+  %load = load volatile i32, ptr addrspace(1) %gep
+  %cmp = icmp eq i32 %load, 0
+  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
+
+bb2:
+  call void asm sideeffect " ; 32 bytes
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  br label %bb3
+
+bb3:
+  store volatile i32 %load, ptr addrspace(1) %gep
+  ret void
+}
+
+define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
+; GCN-LABEL: long_backward_sbranch:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:  .LBB4_1: ; %bb2
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_add_i32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lt_i32 s0, 10
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN-NEXT:  .LBB4_3: ; %bb2
+; GCN-NEXT:    ; in Loop: Header=BB4_1 Depth=1
+; GCN-NEXT:    s_getpc_b64 s[2:3]
+; GCN-NEXT:  .Lpost_getpc3:
+; GCN-NEXT:    s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295
+; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32
+; GCN-NEXT:    s_setpc_b64 s[2:3]
+; GCN-NEXT:  .LBB4_2: ; %bb3
+; GCN-NEXT:    s_endpgm
+
+bb:
+  br label %bb2
+
+bb2:
+  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
+  ; 24 bytes
+  call void asm sideeffect
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
+  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
+  br i1 %cmp, label %bb2, label %bb3 ; -
+
+bb3:
+  ret void
+}
+
+; Requires expansion of unconditional branch from %bb2 to %bb4 (and
+; expansion of conditional branch from %bb to %bb3).
+
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
+; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_mov_b64 s[2:3], -1
+; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
+; GCN-NEXT:  .LBB5_7: ; %bb0
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:  .Lpost_getpc5:
+; GCN-NEXT:    s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295
+; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN-NEXT:  .LBB5_1: ; %Flow
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
+; GCN-NEXT:  .LBB5_2: ; %bb2
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 17
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:  .LBB5_3: ; %bb4
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 63
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+; GCN-NEXT:  .LBB5_4: ; %bb3
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:  s_mov_b64 vcc, exec
+; GCN-NEXT:    s_cbranch_execnz .LBB5_5
+; GCN-NEXT:  .LBB5_9: ; %bb3
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:  .Lpost_getpc6:
+; GCN-NEXT:    s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295
+; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN-NEXT:  .LBB5_5: ; %bb3
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:  .Lpost_getpc4:
+; GCN-NEXT:    s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295
+; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+bb0:
+  %tmp = icmp ne i32 %arg1, 0
+  br i1 %tmp, label %bb2, label %bb3
+
+bb2:
+  store volatile i32 17, ptr addrspace(1) undef
+  br label %bb4
+
+bb3:
+  ; 32 byte asm
+  call void asm sideeffect
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  br label %bb4
+
+bb4:
+  store volatile i32 63, ptr addrspace(1) %arg
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
new file mode 100644
index 00000000000000..88e1f26c4cfd32
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -0,0 +1,529 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=5 -stop-after=branch-relaxation  %s -o - | FileCheck %s
+
+; Test long branch reserved register pass when all
+; SGPRs are used
+
+; CHECK-LABEL: {{^}}name: long_branch_used_all_sgprs
+; CHECK: machineFunctionInfo:
+; CHECK-NEXT:   explicitKernArgSize: 12
+; CHECK-NEXT:   maxKernArgAlign: 8
+; CHECK-NEXT:   ldsSize:         0
+; CHECK-NEXT:   gdsSize:         0
+; CHECK-NEXT:   dynLDSAlign:     1
+; CHECK-NEXT:   isEntryFunction: true
+; CHECK-NEXT:   noSignedZerosFPMath: false
+; CHECK-NEXT:   memoryBound:     false
+; CHECK-NEXT:   waveLimiter:     false
+; CHECK-NEXT:   hasSpilledSGPRs: false
+; CHECK-NEXT:   hasSpilledVGPRs: false
+; CHECK-NEXT:   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+; CHECK-NEXT:   frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT:   stackPtrOffsetReg: '$sgpr32'
+; CHECK-NEXT:   bytesInStackArgArea: 0
+; CHECK-NEXT:   returnsVoid:     true
+; CHECK-NEXT:   argumentInfo:
+; CHECK-NEXT:     kernargSegmentPtr: { reg: '$sgpr0_sgpr1' }
+; CHECK-NEXT:     workGroupIDX:    { reg: '$sgpr2' }
+; CHECK-NEXT:     privateSegmentWaveByteOffset: { reg: '$sgpr3' }
+; CHECK-NEXT:     workItemIDX:     { reg: '$vgpr0' }
+; CHECK-NEXT:   psInputAddr:     0
+; CHECK-NEXT:   psInputEnable:   0
+; CHECK-NEXT:   mode:
+; CHECK-NEXT:     ieee:            true
+; CHECK-NEXT:     dx10-clamp:      true
+; CHECK-NEXT:     fp32-input-denormals: true
+; CHECK-NEXT:     fp32-output-denormals: true
+; CHECK-NEXT:     fp64-fp16-input-denormals: true
+; CHECK-NEXT:     fp64-fp16-output-denormals: true
+; CHECK-NEXT:   highBitsOf32BitAddress: 0
+; CHECK-NEXT:   occupancy:       5
+; CHECK-NEXT:   scavengeFI:      '%fixed-stack.0'
+; CHECK-NEXT:   vgprForAGPRCopy: ''
+; CHECK-NEXT:   longBranchReservedReg: ''
+; CHECK-NEXT: body:
+  define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
+  entry:
+    %long_branch_used_all_sgprs.kernarg.segment = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %cnd.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_used_all_sgprs.kernarg.segment, i64 44, !amdgpu.uniform !0
+    %cnd.load = load i32, ptr addrspace(4) %cnd.kernarg.offset, align 4, !invariant.load !0
+    %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #1
+    %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #1
+    %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #1
+    %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #1
+    %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #1
+    %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #1
+    %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #1
+    %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #1
+    %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #1
+    %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #1
+    %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #1
+    %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #1
+    %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #1
+    %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #1
+    %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #1
+    %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #1
+    %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #1
+    %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #1
+    %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #1
+    %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #1
+    %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #1
+    %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #1
+    %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #1
+    %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #1
+    %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #1
+    %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #1
+    %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #1
+    %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #1
+    %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #1
+    %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #1
+    %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #1
+    %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #1
+    %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #1
+    %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #1
+    %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #1
+    %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #1
+    %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #1
+    %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #1
+    %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #1
+    %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #1
+    %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #1
+    %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #1
+    %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #1
+    %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #1
+    %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #1
+    %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #1
+    %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #1
+    %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #1
+    %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #1
+    %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #1
+    %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #1
+    %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #1
+    %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #1
+    %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #1
+    %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #1
+    %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #1
+    %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #1
+    %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #1
+    %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #1
+    %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #1
+    %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #1
+    %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #1
+    %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #1
+    %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #1
+    %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #1
+    %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #1
+    %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #1
+    %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #1
+    %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #1
+    %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #1
+    %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #1
+    %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #1
+    %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #1
+    %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #1
+    %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #1
+    %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #1
+    %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #1
+    %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #1
+    %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #1
+    %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #1
+    %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #1
+    %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #1
+    %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #1
+    %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #1
+    %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #1
+    %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #1
+    %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #1
+    %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #1
+    %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #1
+    %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #1
+    %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #1
+    %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #1
+    %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #1
+    %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #1
+    %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #1
+    %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #1
+    %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #1
+    %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #1
+    %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #1
+    %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #1
+    %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #1
+    %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #1
+    %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #1
+    %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #1
+    %cmp = icmp ne i32 %cnd.load, 0
+    br i1 %cmp, label %bb2, label %bb3, !amdgpu.uniform !0
+
+  bb2:                                              ; preds = %entry
+    call void asm sideeffect "v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64", ""() #1
+    br label %bb3, !amdgpu.uniform !0
+
+  bb3:                                              ; preds = %bb2, %entry
+    tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #1
+    tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #1
+    tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #1
+    tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #1
+    tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #1
+    tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #1
+    tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #1
+    tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #1
+    tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #1
+    tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #1
+    tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #1
+    tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #1
+    tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #1
+    tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #1
+    tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #1
+    tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #1
+    tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #1
+    tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #1
+    tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #1
+    tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #1
+    tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #1
+    tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #1
+    tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #1
+    tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #1
+    tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #1
+    tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #1
+    tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #1
+    tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #1
+    tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #1
+    tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #1
+    tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #1
+    tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #1
+    tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #1
+    tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #1
+    tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #1
+    tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #1
+    tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #1
+    tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #1
+    tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #1
+    tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #1
+    tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #1
+    tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #1
+    tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #1
+    tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #1
+    tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #1
+    tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #1
+    tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #1
+    tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #1
+    tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #1
+    tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #1
+    tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #1
+    tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #1
+    tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #1
+    tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #1
+    tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #1
+    tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #1
+    tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #1
+    tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #1
+    tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #1
+    tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #1
+    tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #1
+    tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #1
+    tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #1
+    tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #1
+    tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #1
+    tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #1
+    tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #1
+    tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #1
+    tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #1
+    tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #1
+    tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #1
+    tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #1
+    tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #1
+    tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #1
+    tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #1
+    tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #1
+    tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #1
+    tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #1
+    tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #1
+    tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #1
+    tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #1
+    tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #1
+    tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #1
+    tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #1
+    tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #1
+    tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #1
+    tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #1
+    tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #1
+    tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #1
+    tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #1
+    tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #1
+    tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #1
+    tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #1
+    tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #1
+    tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #1
+    tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #1
+    tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #1
+    tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #1
+    tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #1
+    tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #1
+    tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #1
+    tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #1
+    tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #1
+    tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #1
+    ret void
+  }
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+; CHECK-LABEL: {{^}}name: long_branch_high_num_sgprs_used
+; CHECK: machineFunctionInfo:
+; CHECK-NEXT:   explicitKernArgSize: 12
+; CHECK-NEXT:   maxKernArgAlign: 8
+; CHECK-NEXT:   ldsSize:         0
+; CHECK-NEXT:   gdsSize:         0
+; CHECK-NEXT:   dynLDSAlign:     1
+; CHECK-NEXT:   isEntryFunction: true
+; CHECK-NEXT:   noSignedZerosFPMath: false
+; CHECK-NEXT:   memoryBound:     false
+; CHECK-NEXT:   waveLimiter:     false
+; CHECK-NEXT:   hasSpilledSGPRs: false
+; CHECK-NEXT:   hasSpilledVGPRs: false
+; CHECK-NEXT:   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+; CHECK-NEXT:   frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT:   stackPtrOffsetReg: '$sgpr32'
+; CHECK-NEXT:   bytesInStackArgArea: 0
+; CHECK-NEXT:   returnsVoid:     true
+; CHECK-NEXT:   argumentInfo:
+; CHECK-NEXT:     kernargSegmentPtr: { reg: '$sgpr0_sgpr1' }
+; CHECK-NEXT:     workGroupIDX:    { reg: '$sgpr2' }
+; CHECK-NEXT:     privateSegmentWaveByteOffset: { reg: '$sgpr3' }
+; CHECK-NEXT:     workItemIDX:     { reg: '$vgpr0' }
+; CHECK-NEXT:   psInputAddr:     0
+; CHECK-NEXT:   psInputEnable:   0
+; CHECK-NEXT:   mode:
+; CHECK-NEXT:     ieee:            true
+; CHECK-NEXT:     dx10-clamp:      true
+; CHECK-NEXT:     fp32-input-denormals: true
+; CHECK-NEXT:     fp32-output-denormals: true
+; CHECK-NEXT:     fp64-fp16-input-denormals: true
+; CHECK-NEXT:     fp64-fp16-output-denormals: true
+; CHECK-NEXT:   highBitsOf32BitAddress: 0
+; CHECK-NEXT:   occupancy:       5
+; CHECK-NEXT:   vgprForAGPRCopy: ''
+; CHECK-NEXT:   longBranchReservedReg: '$sgpr100_sgpr101'
+; CHECK-NEXT: body:
+  define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
+  entry:
+    %long_branch_used_all_sgprs.kernarg.segment = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %cnd.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_used_all_sgprs.kernarg.segment, i64 44, !amdgpu.uniform !0
+    %cnd.load = load i32, ptr addrspace(4) %cnd.kernarg.offset, align 4, !invariant.load !0
+    %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #1
+    %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #1
+    %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #1
+    %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #1
+    %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #1
+    %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #1
+    %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #1
+    %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #1
+    %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #1
+    %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #1
+    %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #1
+    %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #1
+    %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #1
+    %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #1
+    %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #1
+    %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #1
+    %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #1
+    %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #1
+    %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #1
+    %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #1
+    %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #1
+    %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #1
+    %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #1
+    %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #1
+    %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #1
+    %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #1
+    %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #1
+    %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #1
+    %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #1
+    %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #1
+    %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #1
+    %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #1
+    %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #1
+    %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #1
+    %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #1
+    %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #1
+    %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #1
+    %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #1
+    %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #1
+    %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #1
+    %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #1
+    %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #1
+    %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #1
+    %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #1
+    %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #1
+    %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #1
+    %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #1
+    %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #1
+    %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #1
+    %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #1
+    %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #1
+    %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #1
+    %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #1
+    %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #1
+    %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #1
+    %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #1
+    %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #1
+    %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #1
+    %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #1
+    %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #1
+    %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #1
+    %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #1
+    %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #1
+    %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #1
+    %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #1
+    %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #1
+    %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #1
+    %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #1
+    %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #1
+    %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #1
+    %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #1
+    %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #1
+    %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #1
+    %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #1
+    %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #1
+    %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #1
+    %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #1
+    %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #1
+    %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #1
+    %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #1
+    %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #1
+    %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #1
+    %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #1
+    %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #1
+    %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #1
+    %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #1
+    %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #1
+    %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #1
+    %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #1
+    %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #1
+    %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #1
+    %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #1
+    %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #1
+    %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #1
+    %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #1
+    %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #1
+    %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #1
+    %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #1
+    %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #1
+    %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #1
+    %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #1
+    %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #1
+    %cmp = icmp ne i32 %cnd.load, 0
+    br i1 %cmp, label %bb2, label %bb3, !amdgpu.uniform !0
+
+  bb2:                                              ; preds = %entry
+    call void asm sideeffect "v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64\0A    v_nop_e64", ""() #1
+    br label %bb3, !amdgpu.uniform !0
+
+  bb3:                                              ; preds = %bb2, %entry
+    tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #1
+    tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #1
+    tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #1
+    tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #1
+    tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #1
+    tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #1
+    tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #1
+    tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #1
+    tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #1
+    tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #1
+    tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #1
+    tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #1
+    tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #1
+    tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #1
+    tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #1
+    tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #1
+    tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #1
+    tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #1
+    tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #1
+    tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #1
+    tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #1
+    tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #1
+    tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #1
+    tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #1
+    tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #1
+    tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #1
+    tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #1
+    tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #1
+    tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #1
+    tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #1
+    tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #1
+    tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #1
+    tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #1
+    tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #1
+    tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #1
+    tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #1
+    tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #1
+    tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #1
+    tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #1
+    tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #1
+    tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #1
+    tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #1
+    tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #1
+    tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #1
+    tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #1
+    tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #1
+    tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #1
+    tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #1
+    tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #1
+    tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #1
+    tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #1
+    tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #1
+    tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #1
+    tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #1
+    tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #1
+    tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #1
+    tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #1
+    tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #1
+    tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #1
+    tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #1
+    tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #1
+    tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #1
+    tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #1
+    tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #1
+    tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #1
+    tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #1
+    tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #1
+    tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #1
+    tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #1
+    tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #1
+    tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #1
+    tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #1
+    tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #1
+    tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #1
+    tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #1
+    tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #1
+    tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #1
+    tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #1
+    tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #1
+    tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #1
+    tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #1
+    tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #1
+    tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #1
+    tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #1
+    tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #1
+    tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #1
+    tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #1
+    tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #1
+    tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #1
+    tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #1
+    tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #1
+    tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #1
+    tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #1
+    tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #1
+    tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #1
+    tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #1
+    tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #1
+    tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #1
+    tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #1
+    tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #1
+    tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #1
+    tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #1
+    ret void
+  }
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2
+attributes #1 = { nounwind }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+!0 = !{}

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 1cf3699240d4c9..f59b22d8475c0a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -39,6 +39,7 @@
 ; AFTER-PEI-NEXT: occupancy: 5
 ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0'
 ; AFTER-PEI-NEXT: vgprForAGPRCopy: ''
+; AFTER-PEI-NEXT: longBranchReservedReg: ''
 ; AFTER-PEI-NEXT: body:
 define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr0 = call <32 x i32>  asm sideeffect "; def $0", "=s" () #0

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
new file mode 100644
index 00000000000000..759ef68329f46b
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -0,0 +1,120 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-s-branch-bits=4 -stop-after=branch-relaxation %s -o - | FileCheck %s
+
+; Test that debug instructions do not change the long branch reserved register
+; serialized through MIR.
+
+; CHECK-LABEL: {{^}}name: uniform_long_forward_branch_debug
+; CHECK: machineFunctionInfo:
+; CHECK-NEXT: explicitKernArgSize: 12
+; CHECK-NEXT: maxKernArgAlign: 8
+; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: gdsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
+; CHECK-NEXT: isEntryFunction: true
+; CHECK-NEXT: noSignedZerosFPMath: false
+; CHECK-NEXT: memoryBound: false
+; CHECK-NEXT: waveLimiter: false
+; CHECK-NEXT: hasSpilledSGPRs: false
+; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
+; CHECK-NEXT: bytesInStackArgArea: 0
+; CHECK-NEXT: returnsVoid:     true
+; CHECK-NEXT: argumentInfo:
+; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
+; CHECK-NEXT: psInputAddr:     0
+; CHECK-NEXT: psInputEnable:   0
+; CHECK-NEXT: mode:
+; CHECK-NEXT: ieee:            true
+; CHECK-NEXT: dx10-clamp:      true
+; CHECK-NEXT: fp32-input-denormals: true
+; CHECK-NEXT: fp32-output-denormals: true
+; CHECK-NEXT: fp64-fp16-input-denormals: true
+; CHECK-NEXT: fp64-fp16-output-denormals: true
+; CHECK-NEXT: highBitsOf32BitAddress: 0
+; CHECK-NEXT: occupancy:       8
+; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
+; CHECK-NEXT: body:
+  define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
+  bb0:
+    %uniform_long_forward_branch_debug.kernarg.segment = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr(), !dbg !11
+    %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uniform_long_forward_branch_debug.kernarg.segment, i64 8, !dbg !11, !amdgpu.uniform !7
+    %arg1.load = load i32, ptr addrspace(4) %arg1.kernarg.offset, align 8, !dbg !11, !invariant.load !7
+    %tmp = icmp eq i32 %arg1.load, 0, !dbg !11
+    call void @llvm.dbg.value(metadata i1 %tmp, metadata !9, metadata !DIExpression()), !dbg !11
+    br i1 %tmp, label %bb3, label %Flow, !dbg !12, !amdgpu.uniform !7
+
+  Flow:                                             ; preds = %bb3, %bb0
+    %0 = phi i1 [ false, %bb3 ], [ true, %bb0 ], !dbg !12
+    br i1 %0, label %bb2, label %bb4, !dbg !12, !amdgpu.uniform !7
+
+  bb2:                                              ; preds = %Flow
+    store volatile i32 17, ptr addrspace(1) undef, align 4, !dbg !13
+    br label %bb4, !dbg !14, !amdgpu.uniform !7
+
+  bb3:                                              ; preds = %bb0
+    call void asm sideeffect "v_nop_e64\0A  v_nop_e64\0A  v_nop_e64\0A  v_nop_e64", ""(), !dbg !15
+    br label %Flow, !dbg !16, !amdgpu.uniform !7
+
+  bb4:                                              ; preds = %bb2, %Flow
+    %arg.kernarg.offset1 = bitcast ptr addrspace(4) %uniform_long_forward_branch_debug.kernarg.segment to ptr addrspace(4), !dbg !11, !amdgpu.uniform !7
+    %arg.load = load ptr addrspace(1), ptr addrspace(4) %arg.kernarg.offset1, align 16, !dbg !11, !invariant.load !7
+    store volatile i32 63, ptr addrspace(1) %arg.load, align 4, !dbg !17
+    ret void, !dbg !18
+  }
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none)
+  declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #3
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare i1 @llvm.amdgcn.loop.i64(i64) #2
+
+  ; Function Attrs: convergent nocallback nofree nounwind willreturn
+  declare void @llvm.amdgcn.end.cf.i64(i64) #2
+
+  attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #2 = { convergent nocallback nofree nounwind willreturn }
+  attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.debugify = !{!2, !3}
+  !llvm.module.flags = !{!4}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+  !1 = !DIFile(filename: "temp.ll", directory: "/")
+  !2 = !{i32 8}
+  !3 = !{i32 1}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "uniform_long_forward_branch_debug", linkageName: "uniform_long_forward_branch_debug", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+  !6 = !DISubroutineType(types: !7)
+  !7 = !{}
+  !8 = !{!9}
+  !9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+  !10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+  !11 = !DILocation(line: 1, column: 1, scope: !5)
+  !12 = !DILocation(line: 2, column: 1, scope: !5)
+  !13 = !DILocation(line: 3, column: 1, scope: !5)
+  !14 = !DILocation(line: 4, column: 1, scope: !5)
+  !15 = !DILocation(line: 5, column: 1, scope: !5)
+  !16 = !DILocation(line: 6, column: 1, scope: !5)
+  !17 = !DILocation(line: 7, column: 1, scope: !5)
+  !18 = !DILocation(line: 8, column: 1, scope: !5)
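
A note on what this new test is guarding (editorial, not part of the patch):
the pre-RA long-branch estimate should come out the same whether or not debug
instructions are present, which is why this file repeats the long-branch case
with llvm.dbg.value calls and still expects longBranchReservedReg:
'$sgpr2_sgpr3'. Below is a minimal sketch of such a size estimate; the
function and threshold names are illustrative assumptions, not the actual
GCNPreRALongBranchReg code:

  // Sketch only: sum encoded instruction sizes before register allocation.
  // Debug and other meta instructions report a size of 0, so enabling -g
  // does not change the estimate or the decision to reserve an SGPR pair.
  #include "SIInstrInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"

  static bool mayNeedLongBranch(const llvm::MachineFunction &MF,
                                const llvm::SIInstrInfo &TII,
                                uint64_t ThresholdBytes) {
    uint64_t Bytes = 0;
    for (const llvm::MachineBasicBlock &MBB : MF)
      for (const llvm::MachineInstr &MI : MBB)
        Bytes += TII.getInstSizeInBytes(MI); // 0 for DBG_VALUE and friends
    return Bytes > ThresholdBytes;
  }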

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
new file mode 100644
index 00000000000000..436bc2565d044d
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-s-branch-bits=4 -stop-after=branch-relaxation %s -o - | FileCheck %s
+
+; Test that the long branch reserved register is serialized through
+; MIR.
+
+; CHECK-LABEL: {{^}}name: uniform_long_forward_branch
+; CHECK: machineFunctionInfo:
+; CHECK-NEXT: explicitKernArgSize: 12
+; CHECK-NEXT: maxKernArgAlign: 8
+; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: gdsSize: 0
+; CHECK-NEXT: dynLDSAlign: 1
+; CHECK-NEXT: isEntryFunction: true
+; CHECK-NEXT: noSignedZerosFPMath: false
+; CHECK-NEXT: memoryBound: false
+; CHECK-NEXT: waveLimiter: false
+; CHECK-NEXT: hasSpilledSGPRs: false
+; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
+; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
+; CHECK-NEXT: bytesInStackArgArea: 0
+; CHECK-NEXT: returnsVoid:     true
+; CHECK-NEXT: argumentInfo:
+; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
+; CHECK-NEXT: psInputAddr:     0
+; CHECK-NEXT: psInputEnable:   0
+; CHECK-NEXT: mode:
+; CHECK-NEXT: ieee:            true
+; CHECK-NEXT: dx10-clamp:      true
+; CHECK-NEXT: fp32-input-denormals: true
+; CHECK-NEXT: fp32-output-denormals: true
+; CHECK-NEXT: fp64-fp16-input-denormals: true
+; CHECK-NEXT: fp64-fp16-output-denormals: true
+; CHECK-NEXT: highBitsOf32BitAddress: 0
+; CHECK-NEXT: occupancy:       8
+; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
+; CHECK-NEXT: body:
+define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
+bb0:
+  %tmp = icmp ne i32 %arg1, 0
+  br i1 %tmp, label %bb2, label %bb3
+
+bb2:
+  store volatile i32 17, ptr addrspace(1) undef
+  br label %bb4
+
+bb3:
+  ; 32 byte asm
+  call void asm sideeffect
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
+  br label %bb4
+
+bb4:
+  store volatile i32 63, ptr addrspace(1) %arg
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
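
A quick size check on why this test relaxes its branch (editorial note; the
numbers are an approximation, not text from the patch): the four v_nop_e64
instructions are 4 x 8 = 32 bytes, while -amdgpu-s-branch-bits=4 leaves
s_branch a signed offset of only about -8..+7 dwords, roughly 32 bytes of
reach. The forward branch over bb3 therefore falls out of range, branch
relaxation rewrites it as an indirect branch, and the SGPR pair that sequence
needs is the one reported above as longBranchReservedReg: '$sgpr2_sgpr3'.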

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index fa8607dc1306cb..91d6445d0adc72 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -48,6 +48,7 @@
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
 # FULL-NEXT:  vgprForAGPRCopy: ''
+# FULL-NEXT:  longBranchReservedReg: ''
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -148,6 +149,7 @@ body:             |
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
+# FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -219,6 +221,7 @@ body:             |
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
+# FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -291,6 +294,7 @@ body:             |
 # FULL-NEXT:  highBitsOf32BitAddress: 0
 # FULL-NEXT:  occupancy: 8
 # FULL-NEXT: vgprForAGPRCopy: ''
+# FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 7d365faa336f9d..a2f292081cf954 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -42,6 +42,7 @@
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
   %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -84,6 +85,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 10
 ; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: body:
 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
   %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -150,6 +152,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: body:
 define void @function() {
   ret void
@@ -198,6 +201,7 @@ define void @function() {
 ; CHECK-NEXT: highBitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy: 8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
+; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: body:
 define void @function_nsz() #0 {
   ret void


        

